faceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@ from pyiceberg.exceptions import (
  from pyiceberg.io.fsspec import FsspecFileIO
  from pyiceberg.partitioning import PartitionField, PartitionSpec
  from pyiceberg.schema import Schema
+ from pyiceberg.table.snapshots import Operation
  from pyiceberg.transforms import IdentityTransform
  from pyiceberg.types import LongType, NestedField, StringType

@@ -460,8 +461,6 @@ class TestTableAppend:
          latest_snapshot = snapshots_after[-1]
          assert latest_snapshot.summary is not None
          # Summary.operation is an enum, not a string
-         from pyiceberg.table.snapshots import Operation
-
          assert latest_snapshot.summary.operation == Operation.APPEND

      def test_append_data_partition_integrity(self, writable_catalog):
@@ -0,0 +1,257 @@
+ # tests/test_discover.py
+ import os
+ from unittest.mock import MagicMock, Mock, patch
+
+ import pytest
+ from datasets import Features, Value
+
+ from faceberg.discover import DatasetInfo, ParquetFile, dataset_builder_safe, discover_dataset
+
+
+ def test_parquet_file_creation():
+     """Test creating a ParquetFile with all fields."""
+     pf = ParquetFile(
+         uri="hf://datasets/squad@abc123/train-00000.parquet",
+         path="train-00000.parquet",
+         size=1024,
+         blob_id="abc123def456",
+         split="train",
+     )
+     assert pf.uri == "hf://datasets/squad@abc123/train-00000.parquet"
+     assert pf.path == "train-00000.parquet"
+     assert pf.size == 1024
+     assert pf.blob_id == "abc123def456"
+     assert pf.split == "train"
+
+
+ def test_parquet_file_optional_split():
+     """Test ParquetFile with optional split."""
+     pf = ParquetFile(
+         uri="hf://datasets/squad@abc123/data.parquet",
+         path="data.parquet",
+         size=2048,
+         blob_id="xyz789",
+         split=None,
+     )
+     assert pf.split is None
+
+
+ def test_dataset_info_creation():
+     """Test creating a DatasetInfo with all fields."""
+     features = Features({"text": Value("string")})
+     files = [
+         ParquetFile(
+             uri="hf://datasets/squad@abc123/train-00000.parquet",
+             path="train-00000.parquet",
+             size=1024,
+             blob_id="blob1",
+             split="train",
+         )
+     ]
+
+     info = DatasetInfo(
+         repo_id="squad",
+         config="plain_text",
+         revision="abc123",
+         features=features,
+         splits=["train", "test"],
+         data_dir="data",
+         files=files,
+     )
+
+     assert info.repo_id == "squad"
+     assert info.config == "plain_text"
+     assert info.revision == "abc123"
+     assert info.features == features
+     assert info.splits == ["train", "test"]
+     assert info.data_dir == "data"
+     assert len(info.files) == 1
+     assert info.files[0].uri == "hf://datasets/squad@abc123/train-00000.parquet"
+
+
+ def test_dataset_builder_safe_changes_directory(tmp_path):
+     """Test that dataset_builder_safe changes to temp directory."""
+     original_cwd = os.getcwd()
+
+     with patch("faceberg.discover.load_dataset_builder") as mock_load:
+         mock_builder = MagicMock()
+         mock_load.return_value = mock_builder
+
+         result = dataset_builder_safe("squad", "plain_text", token="test_token")
+
+         # Should be back in original directory
+         assert os.getcwd() == original_cwd
+         assert result == mock_builder
+         mock_load.assert_called_once_with("squad", "plain_text", token="test_token")
+
+
+ def test_dataset_builder_safe_restores_directory_on_error(tmp_path):
+     """Test that directory is restored even on error."""
+     original_cwd = os.getcwd()
+
+     with patch("faceberg.discover.load_dataset_builder") as mock_load:
+         mock_load.side_effect = Exception("Load failed")
+
+         try:
+             dataset_builder_safe("squad", "plain_text")
+         except Exception:
+             pass
+
+         # Should be back in original directory even after error
+         assert os.getcwd() == original_cwd
+
+
+ def test_discover_dataset_basic():
+     """Test basic dataset discovery with mocked APIs."""
+     # Mock builder
+     mock_builder = MagicMock()
+     mock_builder.hash = "abc123def456"
+     mock_builder.info.features = Features({"text": Value("string")})
+     mock_builder.config.data_files = {
+         "train": ["hf://datasets/squad@abc123def456/data/train-00000.parquet"],
+     }
+
+     # Mock HuggingFace API response
+     mock_sibling = MagicMock()
+     mock_sibling.rfilename = "data/train-00000.parquet"
+     mock_sibling.size = 1024
+     mock_sibling.blob_id = "blob123"
+
+     mock_dataset_info = MagicMock()
+     mock_dataset_info.siblings = [mock_sibling]
+
+     with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+         with patch("faceberg.discover.HfApi") as mock_api_class:
+             mock_api = MagicMock()
+             mock_api.dataset_info.return_value = mock_dataset_info
+             mock_api_class.return_value = mock_api
+
+             result = discover_dataset("squad", "plain_text", token="test_token")
+
+             # Verify result
+             assert result.repo_id == "squad"
+             assert result.config == "plain_text"
+             assert result.revision == "abc123def456"
+             assert result.splits == ["train"]
+             assert result.data_dir == "data"
+             assert len(result.files) == 1
+             assert result.files[0].uri == "hf://datasets/squad@abc123def456/data/train-00000.parquet"
+             assert result.files[0].size == 1024
+             assert result.files[0].blob_id == "blob123"
+             assert result.files[0].split == "train"
+
+
+ def test_discover_dataset_multiple_splits():
+     """Test discovery with multiple splits."""
+     # Mock builder
+     mock_builder = MagicMock()
+     mock_builder.hash = "xyz789"
+     mock_builder.info.features = Features({"text": Value("string")})
+     mock_builder.config.data_files = {
+         "train": ["hf://datasets/squad@xyz789/data/train-00000.parquet"],
+         "test": ["hf://datasets/squad@xyz789/data/test-00000.parquet"],
+         "validation": ["hf://datasets/squad@xyz789/data/validation-00000.parquet"],
+     }
+
+     # Mock HuggingFace API response
+     mock_siblings = [
+         Mock(rfilename="data/train-00000.parquet", size=1024, oid="blob1"),
+         Mock(rfilename="data/test-00000.parquet", size=512, oid="blob2"),
+         Mock(rfilename="data/validation-00000.parquet", size=256, oid="blob3"),
+     ]
+
+     mock_dataset_info = MagicMock()
+     mock_dataset_info.siblings = mock_siblings
+
+     with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+         with patch("faceberg.discover.HfApi") as mock_api_class:
+             mock_api = MagicMock()
+             mock_api.dataset_info.return_value = mock_dataset_info
+             mock_api_class.return_value = mock_api
+
+             result = discover_dataset("squad", "plain_text")
+
+             # Verify result
+             assert result.splits == ["train", "test", "validation"]
+             assert len(result.files) == 3
+
+             # Check each file has correct split
+             train_files = [f for f in result.files if f.split == "train"]
+             test_files = [f for f in result.files if f.split == "test"]
+             val_files = [f for f in result.files if f.split == "validation"]
+
+             assert len(train_files) == 1
+             assert len(test_files) == 1
+             assert len(val_files) == 1
+
+             assert train_files[0].size == 1024
+             assert test_files[0].size == 512
+             assert val_files[0].size == 256
+
+
+ def test_discover_dataset_empty():
+     """Test discovery of dataset with no files."""
+     # Mock builder with no data files
+     mock_builder = MagicMock()
+     mock_builder.hash = "empty123"
+     mock_builder.info.features = Features({"text": Value("string")})
+     mock_builder.config.data_files = {}
+
+     mock_dataset_info = MagicMock()
+     mock_dataset_info.siblings = []
+
+     with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+         with patch("faceberg.discover.HfApi") as mock_api_class:
+             mock_api = MagicMock()
+             mock_api.dataset_info.return_value = mock_dataset_info
+             mock_api_class.return_value = mock_api
+
+             result = discover_dataset("empty", "default")
+
+             # Verify result
+             assert result.repo_id == "empty"
+             assert result.splits == []
+             assert result.files == []
+             assert result.data_dir == ""
+
+
+ def test_discover_dataset_missing_file_metadata():
+     """Test that missing file metadata raises ValueError."""
+     # Mock builder with file that won't be in metadata
+     mock_builder = MagicMock()
+     mock_builder.hash = "missing123"
+     mock_builder.info.features = Features({"text": Value("string")})
+     mock_builder.config.data_files = {
+         "train": ["hf://datasets/squad@missing123/data/missing.parquet"],
+     }
+
+     # Mock HuggingFace API response without the file
+     mock_dataset_info = MagicMock()
+     mock_dataset_info.siblings = []  # No files in metadata
+
+     with patch("faceberg.discover.dataset_builder_safe", return_value=mock_builder):
+         with patch("faceberg.discover.HfApi") as mock_api_class:
+             mock_api = MagicMock()
+             mock_api.dataset_info.return_value = mock_dataset_info
+             mock_api_class.return_value = mock_api
+
+             with pytest.raises(ValueError, match="not found in Hub API response"):
+                 discover_dataset("squad", "plain_text")
+
+
+ def test_discover_dataset_invalid_dataset():
+     """Test that invalid dataset raises ValueError."""
+     with patch("faceberg.discover.dataset_builder_safe") as mock_safe:
+         mock_safe.side_effect = Exception("Dataset not found")
+
+         with pytest.raises(ValueError, match="not found or not accessible"):
+             discover_dataset("invalid", "config")
+
+
+ def test_discover_dataset_invalid_config():
+     """Test that invalid config raises ValueError."""
+     with patch("faceberg.discover.dataset_builder_safe") as mock_safe:
+         mock_safe.side_effect = Exception("Config 'invalid' not found")
+
+         with pytest.raises(ValueError, match="not found or not accessible"):
+             discover_dataset("squad", "invalid")