faceberg 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
@@ -1,825 +0,0 @@
1
- """Tests for the bridge layer (dataset discovery, schema conversion, and TableInfo creation)."""
2
-
3
- import pytest
4
- from datasets import Features
5
- from datasets.features import ClassLabel, Sequence, Value
6
- from pyiceberg.schema import Schema
7
- from pyiceberg.types import (
8
- IntegerType,
9
- ListType,
10
- LongType,
11
- StringType,
12
- StructType,
13
- )
14
-
15
- from faceberg.bridge import (
16
- DatasetInfo,
17
- dataset_builder_safe,
18
- iceberg_schema_from_features,
19
- )
20
-
21
-
22
- def test_discover_public_dataset():
23
- """Test discovering a public HuggingFace dataset."""
24
- # Test with a known public dataset
25
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
26
-
27
- assert dataset_info.repo_id == "stanfordnlp/imdb"
28
- assert dataset_info.config == "plain_text"
29
-
30
- # Check splits
31
- splits = dataset_info.splits
32
- assert "train" in splits
33
- assert "test" in splits
34
- assert "unsupervised" in splits
35
-
36
- # Check Parquet files
37
- assert "train" in dataset_info.data_files
38
- train_files = dataset_info.data_files["train"]
39
- assert len(train_files) > 0
40
- assert all(isinstance(f, str) for f in train_files)
41
-
42
-
43
- def test_discover_with_specific_config():
44
- """Test discovering a dataset with a specific config."""
45
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
46
-
47
- assert dataset_info.config == "plain_text"
48
- assert len(dataset_info.splits) > 0
49
-
50
-
51
- def test_discover_nonexistent_dataset():
52
- """Test discovering a non-existent dataset raises ValueError."""
53
- with pytest.raises(ValueError, match="not found or not accessible"):
54
- DatasetInfo.discover("nonexistent/fake-dataset-12345", config="default")
55
-
56
-
57
- def test_discover_nonexistent_config():
58
- """Test discovering a non-existent config raises ValueError."""
59
- with pytest.raises(ValueError, match="Config .* not found"):
60
- DatasetInfo.discover("stanfordnlp/imdb", config="fake_config")
61
-
62
-
63
- def test_to_table_infos():
64
- """Test converting DatasetInfo to TableInfo objects."""
65
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
66
-
67
- # Convert to TableInfo
68
- table_info = dataset_info.to_table_info(
69
- namespace="default",
70
- table_name="imdb_plain_text",
71
- )
72
-
73
- assert table_info.namespace == "default"
74
- assert table_info.table_name == "imdb_plain_text"
75
- assert table_info.identifier == "default.imdb_plain_text"
76
- assert table_info.dataset_repo == "stanfordnlp/imdb"
77
- assert table_info.dataset_config == "plain_text"
78
-
79
- # Check schema
80
- assert table_info.schema is not None
81
- assert len(table_info.schema.fields) > 0
82
- # Should have split column as first field
83
- assert table_info.schema.fields[0].name == "split"
84
-
85
- # Check partition spec (should be partitioned by split)
86
- assert table_info.partition_spec is not None
87
- assert len(table_info.partition_spec.fields) == 1
88
- assert table_info.partition_spec.fields[0].name == "split"
89
-
90
- # Check files
91
- assert len(table_info.data_files) > 0
92
- for file_info in table_info.data_files:
93
- # URIs now include revision: hf://datasets/stanfordnlp/imdb@<revision>/...
94
- assert file_info.uri.startswith("hf://datasets/stanfordnlp/imdb")
95
- assert "@" in file_info.uri or "/" in file_info.uri
96
- assert file_info.split in ["train", "test", "unsupervised"]
97
-
98
- # Check properties
99
- props = table_info.get_table_properties()
100
- assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
101
- assert props["hf.dataset.config"] == "plain_text"
102
-
103
-
104
- # =============================================================================
105
- # Schema Conversion Tests
106
- # =============================================================================
107
-
108
-
109
- def test_build_schema_from_simple_features():
110
- """Test building schema from simple features."""
111
- features = Features(
112
- {
113
- "text": Value("string"),
114
- "label": Value("int64"),
115
- }
116
- )
117
-
118
- schema = iceberg_schema_from_features(features, include_split_column=True)
119
-
120
- # Check split column is first
121
- assert schema.fields[0].name == "split"
122
- assert schema.fields[0].field_id == 1
123
- assert isinstance(schema.fields[0].field_type, StringType)
124
-
125
- # Check original fields
126
- assert len(schema.fields) == 3 # split + text + label
127
- field_names = [f.name for f in schema.fields]
128
- assert "text" in field_names
129
- assert "label" in field_names
130
-
131
-
132
- def test_build_schema_without_split_column():
133
- """Test building schema without split column."""
134
- features = Features(
135
- {
136
- "id": Value("int64"),
137
- "text": Value("string"),
138
- }
139
- )
140
-
141
- schema = iceberg_schema_from_features(features, include_split_column=False)
142
-
143
- # No split column
144
- field_names = [f.name for f in schema.fields]
145
- assert "split" not in field_names
146
- assert len(schema.fields) == 2
147
-
148
-
149
- def test_build_schema_with_nested_features():
150
- """Test building schema with nested structures."""
151
- features = Features(
152
- {
153
- "id": Value("int64"),
154
- "metadata": {
155
- "title": Value("string"),
156
- "author": Value("string"),
157
- "year": Value("int32"),
158
- },
159
- "tags": Sequence(Value("string")),
160
- }
161
- )
162
-
163
- schema = iceberg_schema_from_features(features, include_split_column=False)
164
-
165
- # Verify structure
166
- field_names = [f.name for f in schema.fields]
167
- assert "id" in field_names
168
- assert "metadata" in field_names
169
- assert "tags" in field_names
170
-
171
- # Find metadata field
172
- metadata_field = next(f for f in schema.fields if f.name == "metadata")
173
- assert isinstance(metadata_field.field_type, StructType)
174
-
175
- # Find tags field
176
- tags_field = next(f for f in schema.fields if f.name == "tags")
177
- assert isinstance(tags_field.field_type, ListType)
178
-
179
-
180
- def test_build_schema_with_class_label():
181
- """Test building schema with ClassLabel feature."""
182
- features = Features(
183
- {
184
- "text": Value("string"),
185
- "label": ClassLabel(names=["negative", "positive"]),
186
- }
187
- )
188
-
189
- schema = iceberg_schema_from_features(features, include_split_column=False)
190
-
191
- # ClassLabel should be converted to an integer type
192
- label_field = next(f for f in schema.fields if f.name == "label")
193
- # ClassLabel is typically represented as int64 in Arrow
194
- assert isinstance(label_field.field_type, (IntegerType, LongType))
195
-
196
-
197
- def test_unique_field_ids():
198
- """Test that all field IDs are unique across nested structures."""
199
- features = Features(
200
- {
201
- "id": Value("int64"),
202
- "nested": {
203
- "field1": Value("string"),
204
- "field2": Value("int32"),
205
- "deeper": {
206
- "field3": Value("string"),
207
- },
208
- },
209
- "list_field": Sequence(Value("string")),
210
- }
211
- )
212
-
213
- schema = iceberg_schema_from_features(features, include_split_column=True)
214
-
215
- # Collect all field IDs recursively
216
- def collect_field_ids(field_type, ids=None):
217
- if ids is None:
218
- ids = []
219
-
220
- if isinstance(field_type, StructType):
221
- for field in field_type.fields:
222
- ids.append(field.field_id)
223
- collect_field_ids(field.field_type, ids)
224
- elif isinstance(field_type, ListType):
225
- ids.append(field_type.element_id)
226
- collect_field_ids(field_type.element_type, ids)
227
-
228
- return ids
229
-
230
- # Get all field IDs
231
- all_ids = [f.field_id for f in schema.fields]
232
- for field in schema.fields:
233
- all_ids.extend(collect_field_ids(field.field_type))
234
-
235
- # Check all IDs are unique
236
- assert len(all_ids) == len(set(all_ids)), f"Duplicate field IDs found: {all_ids}"
237
-
238
-
239
- def test_features_dict_to_features_object():
240
- """Test that dict features are properly converted to Features object."""
241
- features_dict = {
242
- "id": Value("int64"),
243
- "text": Value("string"),
244
- }
245
-
246
- schema = iceberg_schema_from_features(features_dict, include_split_column=False)
247
-
248
- # Should work the same as passing Features object
249
- assert isinstance(schema, Schema)
250
- field_names = [f.name for f in schema.fields]
251
- assert "id" in field_names
252
- assert "text" in field_names
253
-
254
-
255
- def test_dataset_builder_safe():
256
- """Test that the safe builder loader works and avoids local files."""
257
- # Test with a known public dataset
258
- builder = dataset_builder_safe("stanfordnlp/imdb", config="plain_text")
259
-
260
- assert builder is not None
261
- assert builder.info is not None
262
- assert builder.info.features is not None
263
-
264
-
265
- def test_dataset_builder_safe_nonexistent():
266
- """Test that safe builder loader raises error for non-existent dataset."""
267
- with pytest.raises(Exception):
268
- dataset_builder_safe("nonexistent/fake-dataset-12345")
269
-
270
-
271
- def test_table_properties_use_hf_prefix():
272
- """Test that table properties use hf.dataset.* prefix."""
273
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
274
- table_info = dataset_info.to_table_info(
275
- namespace="default",
276
- table_name="imdb_plain_text",
277
- )
278
-
279
- props = table_info.get_table_properties()
280
-
281
- # Check that properties use hf.dataset prefix
282
- assert "hf.dataset.repo" in props
283
- assert "hf.dataset.config" in props
284
- assert props["hf.dataset.repo"] == "stanfordnlp/imdb"
285
- assert props["hf.dataset.config"] == "plain_text"
286
-
287
- # Check that revision is always included (now mandatory)
288
- assert "hf.dataset.revision" in props
289
- assert props["hf.dataset.revision"] == table_info.dataset_revision
290
-
291
- # Verify old prefix is not used
292
- assert "faceberg.source.repo" not in props
293
- assert "faceberg.source.config" not in props
294
- assert "faceberg.source.revision" not in props
295
-
296
-
297
- def test_table_info_name_mapping_with_nested_structs():
298
- """Test that name mapping includes nested struct fields."""
299
- import json
300
-
301
- from pyiceberg.types import IntegerType, NestedField, StringType, StructType
302
-
303
- from faceberg.bridge import TableInfo
304
-
305
- # Create a schema with nested structs
306
- schema = Schema(
307
- NestedField(field_id=1, name="id", field_type=StringType(), required=False),
308
- NestedField(
309
- field_id=2,
310
- name="metadata",
311
- field_type=StructType(
312
- NestedField(field_id=3, name="author", field_type=StringType(), required=False),
313
- NestedField(field_id=4, name="year", field_type=IntegerType(), required=False),
314
- ),
315
- required=False,
316
- ),
317
- )
318
-
319
- from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
320
-
321
- table_info = TableInfo(
322
- namespace="test",
323
- table_name="table",
324
- schema=schema,
325
- partition_spec=UNPARTITIONED_PARTITION_SPEC,
326
- data_files=[],
327
- data_dir="data",
328
- dataset_repo="test/repo",
329
- dataset_config="default",
330
- dataset_revision="abc123",
331
- )
332
-
333
- properties = table_info.get_table_properties()
334
- name_mapping = json.loads(properties["schema.name-mapping.default"])
335
-
336
- # Check top-level fields
337
- assert len(name_mapping) == 2
338
- assert name_mapping[0]["field-id"] == 1
339
- assert name_mapping[0]["names"] == ["id"]
340
-
341
- # Check nested struct field
342
- metadata_mapping = name_mapping[1]
343
- assert metadata_mapping["field-id"] == 2
344
- assert metadata_mapping["names"] == ["metadata"]
345
- assert "fields" in metadata_mapping
346
- assert len(metadata_mapping["fields"]) == 2
347
-
348
- # Check nested struct's child fields
349
- assert metadata_mapping["fields"][0]["field-id"] == 3
350
- assert metadata_mapping["fields"][0]["names"] == ["author"]
351
- assert metadata_mapping["fields"][1]["field-id"] == 4
352
- assert metadata_mapping["fields"][1]["names"] == ["year"]
353
-
354
-
355
- def test_table_info_name_mapping_with_lists():
356
- """Test that name mapping includes list element mappings."""
357
- import json
358
-
359
- from pyiceberg.types import ListType, NestedField, StringType, StructType
360
-
361
- from faceberg.bridge import TableInfo
362
-
363
- # Create a schema with list of strings and list of structs
364
- schema = Schema(
365
- NestedField(field_id=1, name="id", field_type=StringType(), required=False),
366
- NestedField(
367
- field_id=2,
368
- name="tags",
369
- field_type=ListType(element_id=3, element_type=StringType(), element_required=False),
370
- required=False,
371
- ),
372
- NestedField(
373
- field_id=4,
374
- name="items",
375
- field_type=ListType(
376
- element_id=5,
377
- element_type=StructType(
378
- NestedField(field_id=6, name="name", field_type=StringType(), required=False),
379
- NestedField(field_id=7, name="value", field_type=StringType(), required=False),
380
- ),
381
- element_required=False,
382
- ),
383
- required=False,
384
- ),
385
- )
386
-
387
- from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
388
-
389
- table_info = TableInfo(
390
- namespace="test",
391
- table_name="table",
392
- schema=schema,
393
- partition_spec=UNPARTITIONED_PARTITION_SPEC,
394
- data_files=[],
395
- data_dir="data",
396
- dataset_repo="test/repo",
397
- dataset_config="default",
398
- dataset_revision="abc123",
399
- )
400
-
401
- properties = table_info.get_table_properties()
402
- name_mapping = json.loads(properties["schema.name-mapping.default"])
403
-
404
- # Check list of strings (tags)
405
- tags_mapping = name_mapping[1]
406
- assert tags_mapping["field-id"] == 2
407
- assert tags_mapping["names"] == ["tags"]
408
- assert "fields" in tags_mapping
409
- assert len(tags_mapping["fields"]) == 1
410
-
411
- # Check element mapping for simple list
412
- element_mapping = tags_mapping["fields"][0]
413
- assert element_mapping["field-id"] == 3
414
- assert element_mapping["names"] == ["element"]
415
-
416
- # Check list of structs (items)
417
- items_mapping = name_mapping[2]
418
- assert items_mapping["field-id"] == 4
419
- assert items_mapping["names"] == ["items"]
420
- assert "fields" in items_mapping
421
-
422
- # Check element mapping for list of structs
423
- items_element = items_mapping["fields"][0]
424
- assert items_element["field-id"] == 5
425
- assert items_element["names"] == ["element"]
426
- assert "fields" in items_element
427
-
428
- # Check struct fields within list element
429
- assert len(items_element["fields"]) == 2
430
- assert items_element["fields"][0]["field-id"] == 6
431
- assert items_element["fields"][0]["names"] == ["name"]
432
- assert items_element["fields"][1]["field-id"] == 7
433
- assert items_element["fields"][1]["names"] == ["value"]
434
-
435
-
436
- def test_table_info_name_mapping_with_maps():
437
- """Test that name mapping includes map key and value mappings."""
438
- import json
439
-
440
- from pyiceberg.types import IntegerType, MapType, NestedField, StringType, StructType
441
-
442
- from faceberg.bridge import TableInfo
443
-
444
- # Create a schema with a map
445
- schema = Schema(
446
- NestedField(field_id=1, name="id", field_type=StringType(), required=False),
447
- NestedField(
448
- field_id=2,
449
- name="metadata",
450
- field_type=MapType(
451
- key_id=3,
452
- key_type=StringType(),
453
- value_id=4,
454
- value_type=StructType(
455
- NestedField(field_id=5, name="count", field_type=IntegerType(), required=False),
456
- NestedField(field_id=6, name="name", field_type=StringType(), required=False),
457
- ),
458
- value_required=False,
459
- ),
460
- required=False,
461
- ),
462
- )
463
-
464
- from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
465
-
466
- table_info = TableInfo(
467
- namespace="test",
468
- table_name="table",
469
- schema=schema,
470
- partition_spec=UNPARTITIONED_PARTITION_SPEC,
471
- data_files=[],
472
- data_dir="data",
473
- dataset_repo="test/repo",
474
- dataset_config="default",
475
- dataset_revision="abc123",
476
- )
477
-
478
- properties = table_info.get_table_properties()
479
- name_mapping = json.loads(properties["schema.name-mapping.default"])
480
-
481
- # Check map field
482
- metadata_mapping = name_mapping[1]
483
- assert metadata_mapping["field-id"] == 2
484
- assert metadata_mapping["names"] == ["metadata"]
485
- assert "fields" in metadata_mapping
486
- assert len(metadata_mapping["fields"]) == 2
487
-
488
- # Check key mapping
489
- key_mapping = metadata_mapping["fields"][0]
490
- assert key_mapping["field-id"] == 3
491
- assert key_mapping["names"] == ["key"]
492
-
493
- # Check value mapping
494
- value_mapping = metadata_mapping["fields"][1]
495
- assert value_mapping["field-id"] == 4
496
- assert value_mapping["names"] == ["value"]
497
- assert "fields" in value_mapping
498
-
499
- # Check struct fields within map value
500
- assert len(value_mapping["fields"]) == 2
501
- assert value_mapping["fields"][0]["field-id"] == 5
502
- assert value_mapping["fields"][0]["names"] == ["count"]
503
- assert value_mapping["fields"][1]["field-id"] == 6
504
- assert value_mapping["fields"][1]["names"] == ["name"]
505
-
506
-
507
- # =============================================================================
508
- # Revision Diff Tests
509
- # =============================================================================
510
-
511
-
512
- def test_dataset_new_files_no_new_files():
513
- """Test when no files were added between revisions."""
514
- from unittest.mock import Mock, patch
515
-
516
- from faceberg.bridge import dataset_new_files
517
-
518
- # Mock HfApi
519
- mock_api = Mock()
520
- mock_api.list_repo_files.return_value = [
521
- "plain_text/train-00000.parquet",
522
- "plain_text/test-00000.parquet",
523
- "README.md",
524
- ]
525
-
526
- with patch("faceberg.bridge.HfApi", return_value=mock_api):
527
- result = dataset_new_files(
528
- repo_id="test/dataset",
529
- config="plain_text",
530
- old_revision="abc123",
531
- new_revision="def456",
532
- )
533
-
534
- # Should return empty list when files are the same
535
- assert result == []
536
-
537
- # Verify API was called with both revisions
538
- assert mock_api.list_repo_files.call_count == 2
539
- calls = mock_api.list_repo_files.call_args_list
540
- assert calls[0].kwargs["revision"] == "abc123"
541
- assert calls[1].kwargs["revision"] == "def456"
542
-
543
-
544
- def test_dataset_new_files_with_new_files():
545
- """Test when new parquet files were added."""
546
- from unittest.mock import Mock, patch
547
-
548
- from faceberg.bridge import dataset_new_files
549
-
550
- # Mock HfApi
551
- mock_api = Mock()
552
-
553
- def list_files_side_effect(**kwargs):
554
- if kwargs["revision"] == "abc123":
555
- # Old revision has 2 parquet files
556
- return [
557
- "plain_text/train-00000.parquet",
558
- "plain_text/test-00000.parquet",
559
- "README.md",
560
- ]
561
- else:
562
- # New revision has 4 parquet files (2 new)
563
- return [
564
- "plain_text/train-00000.parquet",
565
- "plain_text/train-00001.parquet", # NEW
566
- "plain_text/test-00000.parquet",
567
- "plain_text/validation-00000.parquet", # NEW
568
- "README.md",
569
- ]
570
-
571
- mock_api.list_repo_files.side_effect = list_files_side_effect
572
-
573
- with patch("faceberg.bridge.HfApi", return_value=mock_api):
574
- result = dataset_new_files(
575
- repo_id="test/dataset",
576
- config="plain_text",
577
- old_revision="abc123",
578
- new_revision="def456",
579
- )
580
-
581
- # Should return list of new file paths
582
- assert result == [
583
- "plain_text/train-00001.parquet",
584
- "plain_text/validation-00000.parquet",
585
- ]
586
-
587
-
588
- def test_dataset_new_files_filters_by_config():
589
- """Test that only files for specified config are returned."""
590
- from unittest.mock import Mock, patch
591
-
592
- from faceberg.bridge import dataset_new_files
593
-
594
- # Mock HfApi
595
- mock_api = Mock()
596
-
597
- def list_files_side_effect(**kwargs):
598
- if kwargs["revision"] == "abc123":
599
- return ["README.md"]
600
- else:
601
- # New files in multiple configs
602
- return [
603
- "plain_text/train-00000.parquet", # Should be included
604
- "other_config/train-00000.parquet", # Should be excluded
605
- "README.md",
606
- ]
607
-
608
- mock_api.list_repo_files.side_effect = list_files_side_effect
609
-
610
- with patch("faceberg.bridge.HfApi", return_value=mock_api):
611
- result = dataset_new_files(
612
- repo_id="test/dataset",
613
- config="plain_text",
614
- old_revision="abc123",
615
- new_revision="def456",
616
- )
617
-
618
- # Should return only plain_text config file paths
619
- assert result == ["plain_text/train-00000.parquet"]
620
-
621
-
622
- def test_dataset_new_files_ignores_non_parquet():
623
- """Test that non-parquet files are filtered out."""
624
- from unittest.mock import Mock, patch
625
-
626
- from faceberg.bridge import dataset_new_files
627
-
628
- # Mock HfApi
629
- mock_api = Mock()
630
-
631
- def list_files_side_effect(**kwargs):
632
- if kwargs["revision"] == "abc123":
633
- return []
634
- else:
635
- # Mix of file types
636
- return [
637
- "plain_text/train-00000.parquet", # Should be included
638
- "plain_text/metadata.json", # Should be excluded
639
- "plain_text/dataset_info.txt", # Should be excluded
640
- "README.md", # Should be excluded
641
- ]
642
-
643
- mock_api.list_repo_files.side_effect = list_files_side_effect
644
-
645
- with patch("faceberg.bridge.HfApi", return_value=mock_api):
646
- result = dataset_new_files(
647
- repo_id="test/dataset",
648
- config="plain_text",
649
- old_revision="abc123",
650
- new_revision="def456",
651
- )
652
-
653
- # Should return only parquet file paths
654
- assert result == ["plain_text/train-00000.parquet"]
655
-
656
-
657
- def test_discover_with_since_revision():
658
- """Test that passing since_revision to discover filters to new files only."""
659
- from unittest.mock import Mock, patch
660
-
661
- from datasets.features import Value
662
-
663
- # Mock dataset_builder_safe to return a mock builder
664
- mock_builder = Mock()
665
- mock_builder.hash = "def456"
666
- mock_builder.info.features = Features(
667
- {
668
- "text": Value("string"),
669
- "label": Value("int64"),
670
- }
671
- )
672
- mock_builder.config.data_dir = None
673
- mock_builder.config.data_files = {
674
- "train": [
675
- "hf://datasets/test/dataset@def456/plain_text/train-00000.parquet",
676
- "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet",
677
- ],
678
- "test": ["hf://datasets/test/dataset@def456/plain_text/test-00000.parquet"],
679
- }
680
-
681
- # Mock dataset_new_files to return list of new file paths
682
- mock_get_new_files = Mock(
683
- return_value=[
684
- "plain_text/train-00001.parquet",
685
- "plain_text/test-00000.parquet",
686
- ]
687
- )
688
-
689
- # Mock HfFileSystem to resolve file URIs
690
- mock_fs = Mock()
691
-
692
- def mock_resolve_path(uri):
693
- # Extract path from URI: "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet"
694
- # Split: ['hf:', '', 'datasets', 'test', 'dataset@def456', 'plain_text',
695
- # 'train-00001.parquet']
696
- parts = uri.split("/")
697
- # Join everything after repo@revision (starting from index 5)
698
- path = "/".join(parts[5:])
699
- mock_result = Mock()
700
- mock_result.path_in_repo = path
701
- return mock_result
702
-
703
- mock_fs.resolve_path.side_effect = mock_resolve_path
704
-
705
- with (
706
- patch("faceberg.bridge.dataset_builder_safe", return_value=mock_builder),
707
- patch("faceberg.bridge.dataset_new_files", mock_get_new_files),
708
- patch("faceberg.bridge.HfFileSystem", return_value=mock_fs),
709
- ):
710
- # Discover with since_revision (should return only new files)
711
- dataset_info = DatasetInfo.discover(
712
- repo_id="test/dataset",
713
- config="plain_text",
714
- since_revision="abc123",
715
- )
716
-
717
- # Should have 2 splits, each containing new files
718
- assert len(dataset_info.splits) == 2
719
- assert "train" in dataset_info.splits
720
- assert "test" in dataset_info.splits
721
-
722
- # Verify data files are populated with new files
723
- assert "train" in dataset_info.data_files
724
- assert "test" in dataset_info.data_files
725
- assert dataset_info.data_files["train"] == [
726
- "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet"
727
- ]
728
- assert dataset_info.data_files["test"] == [
729
- "hf://datasets/test/dataset@def456/plain_text/test-00000.parquet"
730
- ]
731
-
732
- # Verify dataset_new_files was called with correct args
733
- mock_get_new_files.assert_called_once_with(
734
- repo_id="test/dataset",
735
- config="plain_text",
736
- old_revision="abc123",
737
- new_revision="def456",
738
- token=None,
739
- )
740
-
741
- # Now convert to TableInfo and verify
742
- table_info = dataset_info.to_table_info(
743
- namespace="default",
744
- table_name="test_table",
745
- )
746
-
747
- # Should have only 2 files (the new ones)
748
- assert len(table_info.data_files) == 2
749
- file_paths = [f.uri for f in table_info.data_files]
750
- assert "hf://datasets/test/dataset@def456/plain_text/train-00001.parquet" in file_paths
751
- assert "hf://datasets/test/dataset@def456/plain_text/test-00000.parquet" in file_paths
752
-
753
- # Verify files are properly organized by split
754
- splits = {f.split for f in table_info.data_files}
755
- assert "train" in splits
756
- assert "test" in splits
757
-
758
-
759
- def test_features_stored_in_dataset_info():
760
- """Test that features are stored in DatasetInfo during discover()."""
761
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
762
-
763
- # Features should be stored in DatasetInfo
764
- assert hasattr(dataset_info, "features")
765
- assert dataset_info.features is not None
766
- assert isinstance(dataset_info.features, Features)
767
-
768
- # Features should have expected fields for this dataset
769
- assert "text" in dataset_info.features
770
- assert "label" in dataset_info.features
771
-
772
-
773
- def test_to_table_info_uses_stored_features():
774
- """Test that to_table_info uses stored features instead of calling dataset_builder_safe."""
775
- from unittest.mock import patch
776
-
777
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
778
-
779
- # Mock dataset_builder_safe to ensure it's NOT called
780
- with patch("faceberg.bridge.dataset_builder_safe") as mock_builder:
781
- table_info = dataset_info.to_table_info(
782
- namespace="default",
783
- table_name="imdb_plain_text",
784
- )
785
-
786
- # dataset_builder_safe should NOT have been called since features are stored
787
- mock_builder.assert_not_called()
788
-
789
- # TableInfo should still be created successfully
790
- assert table_info.schema is not None
791
- assert len(table_info.schema.fields) > 0
792
-
793
-
794
- if __name__ == "__main__":
795
- # Run basic smoke test
796
- print("Running basic discovery test...")
797
- dataset_info = DatasetInfo.discover("stanfordnlp/imdb", config="plain_text")
798
- print(f"✓ Discovered config: {dataset_info.config}")
799
- print(f"✓ Found splits: {dataset_info.splits}")
800
-
801
- # Count total parquet files
802
- total_files = sum(len(files) for files in dataset_info.data_files.values())
803
- print(f"✓ Found {total_files} Parquet files across {len(dataset_info.data_files)} splits")
804
-
805
- # Get a sample file
806
- first_split_files = next(iter(dataset_info.data_files.values()))
807
- if first_split_files:
808
- # Files are already fully qualified URIs
809
- sample = first_split_files[0]
810
- print(f"✓ Sample file: {sample}")
811
-
812
- print("\nRunning schema conversion tests...")
813
- test_build_schema_from_simple_features()
814
- print("✓ Simple features test passed")
815
-
816
- test_build_schema_without_split_column()
817
- print("✓ No split column test passed")
818
-
819
- test_build_schema_with_nested_features()
820
- print("✓ Nested features test passed")
821
-
822
- test_unique_field_ids()
823
- print("✓ Unique field IDs test passed")
824
-
825
- print("\n✓ All tests passed!")