faceberg 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/METADATA +9 -7
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
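The headline change in 0.1.2 is the removal of the bridge/convert modules in favor of a new discover/iceberg pair. Judging from the new test_discover.py below, the discovery entry point is called roughly like this (a hedged sketch: the signature and result fields are read off the test assertions below; the token value and prints are illustrative placeholders):

    from faceberg.discover import discover_dataset

    # Resolve a Hub dataset config to its revision-pinned parquet files.
    info = discover_dataset("squad", "plain_text", token="hf_xxx")  # placeholder token

    print(info.revision)  # builder hash that the file URIs are pinned to
    for f in info.files:
        # e.g. "train" hf://datasets/squad@<revision>/data/train-00000.parquet 1024
        print(f.split, f.uri, f.size)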
faceberg/tests/test_catalog.py
CHANGED
@@ -14,6 +14,7 @@ from pyiceberg.exceptions import (
 from pyiceberg.io.fsspec import FsspecFileIO
 from pyiceberg.partitioning import PartitionField, PartitionSpec
 from pyiceberg.schema import Schema
+from pyiceberg.table.snapshots import Operation
 from pyiceberg.transforms import IdentityTransform
 from pyiceberg.types import LongType, NestedField, StringType
 
@@ -460,8 +461,6 @@ class TestTableAppend:
         latest_snapshot = snapshots_after[-1]
         assert latest_snapshot.summary is not None
         # Summary.operation is an enum, not a string
-        from pyiceberg.table.snapshots import Operation
-
         assert latest_snapshot.summary.operation == Operation.APPEND
 
     def test_append_data_partition_integrity(self, writable_catalog):
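The only change to test_catalog.py is hoisting the Operation import out of the test body to module scope; the assertion itself is untouched. For reference, a minimal sketch of the pattern this test checks, assuming a pyiceberg Table handle (the helper name is hypothetical):

    from pyiceberg.table.snapshots import Operation

    def assert_last_append(table):  # hypothetical helper
        # Snapshot summaries expose the operation as an enum, not a string,
        # so compare against Operation.APPEND rather than "append".
        snapshot = table.current_snapshot()
        assert snapshot is not None and snapshot.summary is not None
        assert snapshot.summary.operation == Operation.APPEND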
faceberg/tests/test_discover.py
ADDED
@@ -0,0 +1,257 @@
+# tests/test_discover.py
+import os
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from datasets import Features, Value
+
+from faceberg.discover import DatasetInfo, ParquetFile, dataset_builder_safe, discover_dataset
+
+
+def test_parquet_file_creation():
+    """Test creating a ParquetFile with all fields."""
+    pf = ParquetFile(
+        uri="hf://datasets/squad@abc123/train-00000.parquet",
+        path="train-00000.parquet",
+        size=1024,
+        blob_id="abc123def456",
+        split="train",
+    )
+    assert pf.uri == "hf://datasets/squad@abc123/train-00000.parquet"
+    assert pf.path == "train-00000.parquet"
+    assert pf.size == 1024
+    assert pf.blob_id == "abc123def456"
+    assert pf.split == "train"
+
+
+def test_parquet_file_optional_split():
+    """Test ParquetFile with optional split."""
+    pf = ParquetFile(
+        uri="hf://datasets/squad@abc123/data.parquet",
+        path="data.parquet",
+        size=2048,
+        blob_id="xyz789",
+        split=None,
+    )
+    assert pf.split is None
+
+
+def test_dataset_info_creation():
+    """Test creating a DatasetInfo with all fields."""
+    features = Features({"text": Value("string")})
+    files = [
+        ParquetFile(
+            uri="hf://datasets/squad@abc123/train-00000.parquet",
+            path="train-00000.parquet",
+            size=1024,
+            blob_id="blob1",
+            split="train",
+        )
+    ]
+
+    info = DatasetInfo(
+        repo_id="squad",
+        config="plain_text",
+        revision="abc123",
+        features=features,
+        splits=["train", "test"],
+        data_dir="data",
+        files=files,
+    )
+
+    assert info.repo_id == "squad"
+    assert info.config == "plain_text"
+    assert info.revision == "abc123"
+    assert info.features == features
+    assert info.splits == ["train", "test"]
+    assert info.data_dir == "data"
+    assert len(info.files) == 1
+    assert info.files[0].uri == "hf://datasets/squad@abc123/train-00000.parquet"
+
+
+def test_dataset_builder_safe_changes_directory(tmp_path):
+    """Test that dataset_builder_safe changes to temp directory."""
+    original_cwd = os.getcwd()
+
+    with patch("faceberg.discover.load_dataset_builder") as mock_load:
+        mock_builder = MagicMock()
+        mock_load.return_value = mock_builder
+
+        result = dataset_builder_safe("squad", "plain_text", token="test_token")
+
+        # Should be back in original directory
+        assert os.getcwd() == original_cwd
+        assert result == mock_builder
+        mock_load.assert_called_once_with("squad", "plain_text", token="test_token")
+
+
+def test_dataset_builder_safe_restores_directory_on_error(tmp_path):
+    """Test that directory is restored even on error."""
+    original_cwd = os.getcwd()
+
+    with patch("faceberg.discover.load_dataset_builder") as mock_load:
+        mock_load.side_effect = Exception("Load failed")
+
+        try:
+            dataset_builder_safe("squad", "plain_text")
+        except Exception:
+            pass
+
+        # Should be back in original directory even after error
+        assert os.getcwd() == original_cwd
+
+
+def test_discover_dataset_basic():
+    """Test basic dataset discovery with mocked APIs."""
+    # Mock builder
+    mock_builder = MagicMock()
+    mock_builder.hash = "abc123def456"
+    mock_builder.info.features = Features({"text": Value("string")})
+    mock_builder.config.data_files = {
+        "train": ["hf://datasets/squad@abc123def456/data/train-00000.parquet"],
+    }
+
+    # Mock HuggingFace API response
+    mock_sibling = MagicMock()
+    mock_sibling.rfilename = "data/train-00000.parquet"
+    mock_sibling.size = 1024
+    mock_sibling.blob_id = "blob123"
+
+    mock_dataset_info = MagicMock()
+    mock_dataset_info.siblings = [mock_sibling]
+
+    with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+        with patch("faceberg.discover.HfApi") as mock_api_class:
+            mock_api = MagicMock()
+            mock_api.dataset_info.return_value = mock_dataset_info
+            mock_api_class.return_value = mock_api
+
+            result = discover_dataset("squad", "plain_text", token="test_token")
+
+    # Verify result
+    assert result.repo_id == "squad"
+    assert result.config == "plain_text"
+    assert result.revision == "abc123def456"
+    assert result.splits == ["train"]
+    assert result.data_dir == "data"
+    assert len(result.files) == 1
+    assert result.files[0].uri == "hf://datasets/squad@abc123def456/data/train-00000.parquet"
+    assert result.files[0].size == 1024
+    assert result.files[0].blob_id == "blob123"
+    assert result.files[0].split == "train"
+
+
+def test_discover_dataset_multiple_splits():
+    """Test discovery with multiple splits."""
+    # Mock builder
+    mock_builder = MagicMock()
+    mock_builder.hash = "xyz789"
+    mock_builder.info.features = Features({"text": Value("string")})
+    mock_builder.config.data_files = {
+        "train": ["hf://datasets/squad@xyz789/data/train-00000.parquet"],
+        "test": ["hf://datasets/squad@xyz789/data/test-00000.parquet"],
+        "validation": ["hf://datasets/squad@xyz789/data/validation-00000.parquet"],
+    }
+
+    # Mock HuggingFace API response
+    mock_siblings = [
+        Mock(rfilename="data/train-00000.parquet", size=1024, oid="blob1"),
+        Mock(rfilename="data/test-00000.parquet", size=512, oid="blob2"),
+        Mock(rfilename="data/validation-00000.parquet", size=256, oid="blob3"),
+    ]
+
+    mock_dataset_info = MagicMock()
+    mock_dataset_info.siblings = mock_siblings
+
+    with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+        with patch("faceberg.discover.HfApi") as mock_api_class:
+            mock_api = MagicMock()
+            mock_api.dataset_info.return_value = mock_dataset_info
+            mock_api_class.return_value = mock_api
+
+            result = discover_dataset("squad", "plain_text")
+
+    # Verify result
+    assert result.splits == ["train", "test", "validation"]
+    assert len(result.files) == 3
+
+    # Check each file has correct split
+    train_files = [f for f in result.files if f.split == "train"]
+    test_files = [f for f in result.files if f.split == "test"]
+    val_files = [f for f in result.files if f.split == "validation"]
+
+    assert len(train_files) == 1
+    assert len(test_files) == 1
+    assert len(val_files) == 1
+
+    assert train_files[0].size == 1024
+    assert test_files[0].size == 512
+    assert val_files[0].size == 256
+
+
+def test_discover_dataset_empty():
+    """Test discovery of dataset with no files."""
+    # Mock builder with no data files
+    mock_builder = MagicMock()
+    mock_builder.hash = "empty123"
+    mock_builder.info.features = Features({"text": Value("string")})
+    mock_builder.config.data_files = {}
+
+    mock_dataset_info = MagicMock()
+    mock_dataset_info.siblings = []
+
+    with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+        with patch("faceberg.discover.HfApi") as mock_api_class:
+            mock_api = MagicMock()
+            mock_api.dataset_info.return_value = mock_dataset_info
+            mock_api_class.return_value = mock_api
+
+            result = discover_dataset("empty", "default")
+
+    # Verify result
+    assert result.repo_id == "empty"
+    assert result.splits == []
+    assert result.files == []
+    assert result.data_dir == ""
+
+
+def test_discover_dataset_missing_file_metadata():
+    """Test that missing file metadata raises ValueError."""
+    # Mock builder with file that won't be in metadata
+    mock_builder = MagicMock()
+    mock_builder.hash = "missing123"
+    mock_builder.info.features = Features({"text": Value("string")})
+    mock_builder.config.data_files = {
+        "train": ["hf://datasets/squad@missing123/data/missing.parquet"],
+    }
+
+    # Mock HuggingFace API response without the file
+    mock_dataset_info = MagicMock()
+    mock_dataset_info.siblings = []  # No files in metadata
+
+    with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+        with patch("faceberg.discover.HfApi") as mock_api_class:
+            mock_api = MagicMock()
+            mock_api.dataset_info.return_value = mock_dataset_info
+            mock_api_class.return_value = mock_api
+
+            with pytest.raises(ValueError, match="not found in Hub API response"):
+                discover_dataset("squad", "plain_text")
+
+
+def test_discover_dataset_invalid_dataset():
+    """Test that invalid dataset raises ValueError."""
+    with patch("faceberg.discover.dataset_builder_safe") as mock_safe:
+        mock_safe.side_effect = Exception("Dataset not found")
+
+        with pytest.raises(ValueError, match="not found or not accessible"):
+            discover_dataset("invalid", "config")
+
+
+def test_discover_dataset_invalid_config():
+    """Test that invalid config raises ValueError."""
+    with patch("faceberg.discover.dataset_builder_safe") as mock_safe:
+        mock_safe.side_effect = Exception("Config 'invalid' not found")
+
+        with pytest.raises(ValueError, match="not found or not accessible"):
+            discover_dataset("squad", "invalid")