faceberg-0.1.0-py3-none-any.whl → faceberg-0.1.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,422 +0,0 @@
- """Tests for the convert module (Iceberg metadata generation)."""
-
- from unittest.mock import Mock, patch
-
- import pytest
- from pyiceberg.schema import Schema
- from pyiceberg.types import IntegerType, NestedField, StringType
-
- from faceberg.bridge import FileInfo
- from faceberg.convert import IcebergMetadataWriter
-
-
- @pytest.fixture
- def temp_table_path(tmp_path):
-     """Create a temporary table path for testing."""
-     table_path = tmp_path / "test_namespace" / "test_table"
-     table_path.mkdir(parents=True, exist_ok=True)
-     return table_path
-
-
- @pytest.fixture
- def simple_schema():
-     """Create a simple Iceberg schema for testing."""
-     return Schema(
-         NestedField(field_id=1, name="id", field_type=StringType(), required=False),
-         NestedField(field_id=2, name="value", field_type=IntegerType(), required=False),
-     )
-
-
- @pytest.fixture
- def metadata_writer(temp_table_path, simple_schema):
-     """Create a metadata writer instance for testing."""
-     # Construct file:// URI for the temp path
-     path_str = temp_table_path.absolute().as_posix()
-     base_uri = f"file:///{path_str.lstrip('/')}"
-     return IcebergMetadataWriter(
-         table_path=temp_table_path, schema=simple_schema, base_uri=base_uri
-     )
-
-
- class TestGetHfFileSize:
-     """Tests for the _get_hf_file_size method."""
-
-     def test_get_hf_file_size_valid_url(self, metadata_writer):
-         """Test getting file size from a valid HuggingFace URL."""
-         test_url = "hf://datasets/deepmind/narrativeqa/data/train-00000-of-00024.parquet"
-
-         with (
-             patch("faceberg.convert.hf_hub_url") as mock_hf_hub_url,
-             patch("faceberg.convert.get_hf_file_metadata") as mock_get_metadata,
-         ):
-             # Setup mocks
-             mock_hf_hub_url.return_value = "https://huggingface.co/mock-url"
-
-             mock_metadata = Mock()
-             mock_metadata.size = 9799947
-             mock_get_metadata.return_value = mock_metadata
-
-             # Test
-             file_size = metadata_writer._get_hf_file_size(test_url)
-
-             # Verify
-             assert file_size == 9799947
-             mock_hf_hub_url.assert_called_once_with(
-                 repo_id="deepmind/narrativeqa",
-                 filename="data/train-00000-of-00024.parquet",
-                 repo_type="dataset",
-                 revision=None,
-             )
-             mock_get_metadata.assert_called_once()
-
-     def test_get_hf_file_size_nested_path(self, metadata_writer):
-         """Test getting file size from a URL with deeply nested path."""
-         test_url = "hf://datasets/org/repo/path/to/deep/file.parquet"
-
-         with (
-             patch("faceberg.convert.hf_hub_url") as mock_hf_hub_url,
-             patch("faceberg.convert.get_hf_file_metadata") as mock_get_metadata,
-         ):
-             mock_hf_hub_url.return_value = "https://mock.url"
-             mock_metadata = Mock()
-             mock_metadata.size = 12345678
-             mock_get_metadata.return_value = mock_metadata
-
-             file_size = metadata_writer._get_hf_file_size(test_url)
-
-             assert file_size == 12345678
-             mock_hf_hub_url.assert_called_once_with(
-                 repo_id="org/repo",
-                 filename="path/to/deep/file.parquet",
-                 repo_type="dataset",
-                 revision=None,
-             )
-
-     def test_get_hf_file_size_invalid_url_format(self, metadata_writer):
-         """Test handling of invalid URL format."""
-         import pytest
-
-         # Not a hf:// URL
-         with pytest.raises(ValueError, match="Invalid HuggingFace file path"):
-             metadata_writer._get_hf_file_size("s3://bucket/file.parquet")
-
-         # Invalid hf:// URL (too few parts)
-         with pytest.raises(ValueError, match="Invalid HuggingFace file path format"):
-             metadata_writer._get_hf_file_size("hf://datasets/repo")
-
-     def test_get_hf_file_size_api_error(self, metadata_writer):
-         """Test handling of HuggingFace API errors."""
-         import pytest
-
-         test_url = "hf://datasets/org/repo/file.parquet"
-
-         with (
-             patch("faceberg.convert.hf_hub_url") as mock_hf_hub_url,
-             patch("faceberg.convert.get_hf_file_metadata") as mock_get_metadata,
-         ):
-             mock_hf_hub_url.return_value = "https://mock.url"
-             # Simulate API error
-             mock_get_metadata.side_effect = Exception("API Error")
-
-             # Should raise the API error (fail-fast behavior)
-             with pytest.raises(Exception, match="API Error"):
-                 metadata_writer._get_hf_file_size(test_url)
-
-
- class TestReadFileMetadata:
-     """Tests for the _read_file_metadata method."""
-
-     def test_read_file_metadata_gets_file_size(self, metadata_writer):
-         """Test that _read_file_metadata gets file size from HuggingFace when size_bytes is 0."""
-         file_infos = [
-             FileInfo(
-                 uri="hf://datasets/org/repo/file1.parquet",
-                 size_bytes=0,  # No size provided
-                 row_count=0,
-                 split="train",
-             )
-         ]
-
-         with (
-             patch("faceberg.convert.pq.read_metadata") as mock_read_metadata,
-             patch.object(metadata_writer, "_get_hf_file_size") as mock_get_size,
-         ):
-             # Mock parquet metadata
-             mock_metadata = Mock()
-             mock_metadata.num_rows = 1000
-             mock_read_metadata.return_value = mock_metadata
-
-             # Mock file size from HuggingFace
-             mock_get_size.return_value = 9876543
-
-             # Test
-             enriched = metadata_writer._read_file_metadata(file_infos)
-
-             # Verify
-             assert len(enriched) == 1
-             assert enriched[0].uri == "hf://datasets/org/repo/file1.parquet"
-             assert enriched[0].size_bytes == 9876543
-             assert enriched[0].row_count == 1000
-             mock_get_size.assert_called_once_with("hf://datasets/org/repo/file1.parquet")
-
-     def test_read_file_metadata_preserves_provided_size(self, metadata_writer):
-         """Test that _read_file_metadata preserves size_bytes when already provided."""
-         file_infos = [
-             FileInfo(
-                 uri="hf://datasets/org/repo/file1.parquet",
-                 size_bytes=5555555,  # Size already provided
-                 row_count=0,
-                 split="train",
-             )
-         ]
-
-         with (
-             patch("faceberg.convert.pq.read_metadata") as mock_read_metadata,
-             patch.object(metadata_writer, "_get_hf_file_size") as mock_get_size,
-         ):
-             mock_metadata = Mock()
-             mock_metadata.num_rows = 1000
-             mock_read_metadata.return_value = mock_metadata
-
-             enriched = metadata_writer._read_file_metadata(file_infos)
-
-             # Should use provided size, not call _get_hf_file_size
-             assert enriched[0].size_bytes == 5555555
-             mock_get_size.assert_not_called()
-
-     def test_read_file_metadata_multiple_files(self, metadata_writer):
-         """Test enriching metadata for multiple files."""
-         file_infos = [
-             FileInfo(
-                 uri="hf://datasets/org/repo/file1.parquet",
-                 size_bytes=0,
-                 row_count=0,
-                 split="train",
-             ),
-             FileInfo(
-                 uri="hf://datasets/org/repo/file2.parquet",
-                 size_bytes=0,
-                 row_count=0,
-                 split="train",
-             ),
-             FileInfo(
-                 uri="hf://datasets/org/repo/file3.parquet",
-                 size_bytes=123456,  # Already has size
-                 row_count=500,  # This will be overwritten by reading parquet metadata
-                 split="test",
-             ),
-         ]
-
-         with (
-             patch("faceberg.convert.pq.read_metadata") as mock_read_metadata,
-             patch.object(metadata_writer, "_get_hf_file_size") as mock_get_size,
-         ):
-             # Mock parquet metadata - return different row counts for each file
-             def get_metadata_side_effect(path):
-                 mock_metadata = Mock()
-                 if "file1" in path:
-                     mock_metadata.num_rows = 1000
-                 elif "file2" in path:
-                     mock_metadata.num_rows = 2000
-                 else:  # file3
-                     mock_metadata.num_rows = 3000
-                 return mock_metadata
-
-             mock_read_metadata.side_effect = get_metadata_side_effect
-
-             # Mock file sizes from HuggingFace for files without size
-             mock_get_size.side_effect = [9999999, 8888888]
-
-             enriched = metadata_writer._read_file_metadata(file_infos)
-
-             assert len(enriched) == 3
-             # File 1: no size, gets it from HuggingFace
-             assert enriched[0].size_bytes == 9999999
-             assert enriched[0].row_count == 1000
-             # File 2: no size, gets it from HuggingFace
-             assert enriched[1].size_bytes == 8888888
-             assert enriched[1].row_count == 2000
-             # File 3: has size, uses it, but row_count is read from parquet metadata
-             assert enriched[2].size_bytes == 123456
-             assert enriched[2].row_count == 3000  # Overwritten by parquet metadata
-             # Should only call _get_hf_file_size for first two files (file3 has size)
-             assert mock_get_size.call_count == 2
-
-     def test_read_file_metadata_handles_read_error(self, metadata_writer):
-         """Test that metadata read errors are raised (fail-fast behavior)."""
-         import pytest
-
-         file_infos = [
-             FileInfo(
-                 uri="hf://datasets/org/repo/file1.parquet",
-                 size_bytes=0,
-                 row_count=0,
-                 split="train",
-             )
-         ]
-
-         with patch("faceberg.convert.pq.read_metadata") as mock_read_metadata:
-             # Simulate read error
-             mock_read_metadata.side_effect = Exception("Cannot read metadata")
-
-             # Should raise the error (fail-fast behavior)
-             with pytest.raises(Exception, match="Cannot read metadata"):
-                 metadata_writer._read_file_metadata(file_infos)
-
-
- class TestFileSizeRegression:
-     """Regression tests to ensure the bug fix works correctly."""
-
-     def test_file_size_not_using_serialized_size(self, metadata_writer):
-         """Regression test: ensure we don't use metadata.serialized_size (the original bug)."""
-         # This is the key regression test for the bug fix
-         file_infos = [
-             FileInfo(
-                 uri="hf://datasets/deepmind/narrativeqa/data/train-00000-of-00024.parquet",
-                 size_bytes=0,
-                 row_count=0,
-                 split="train",
-             )
-         ]
-
-         with (
-             patch("faceberg.convert.pq.read_metadata") as mock_read_metadata,
-             patch.object(metadata_writer, "_get_hf_file_size") as mock_get_size,
-         ):
-             # The bug was using metadata.serialized_size which is ~500 bytes
-             mock_metadata = Mock()
-             mock_metadata.num_rows = 1365
-             mock_metadata.serialized_size = 550  # This is the WRONG value that was used before
-
-             mock_read_metadata.return_value = mock_metadata
-
-             # The correct file size from HuggingFace API
-             mock_get_size.return_value = 9799947
-
-             enriched = metadata_writer._read_file_metadata(file_infos)
-
-             # Verify we're using the correct file size, not serialized_size
-             assert enriched[0].size_bytes == 9799947
-             assert enriched[0].size_bytes != 550
-             # The ratio should be reasonable (actual size vs metadata footer size)
-             assert enriched[0].size_bytes / mock_metadata.serialized_size > 1000
-
-     def test_file_sizes_match_real_world_ratios(self, metadata_writer):
-         """Test that file sizes match expected ratios from real-world HuggingFace datasets."""
-         # From the bug report, we saw ratios of 500-19000x between actual and serialized_size
-         file_infos = [
-             FileInfo(
-                 uri=f"hf://datasets/deepmind/narrativeqa/data/train-{i:05d}-of-00024.parquet",
-                 size_bytes=0,
-                 row_count=0,
-                 split="train",
-             )
-             for i in range(5)
-         ]
-
-         # Real-world compressed data sizes (excluding footer)
-         compressed_sizes = [9766702, 67176993, 232523620, 27221729, 88315563]
-         # Typical metadata.serialized_size values (footer size)
-         serialized_sizes = [18853, 10532, 11971, 9938, 19011]
-
-         with (
-             patch("faceberg.convert.pq.read_metadata") as mock_read_metadata,
-             patch.object(metadata_writer, "_get_hf_file_size") as mock_get_size,
-         ):
-
-             def get_metadata_side_effect(path):
-                 idx = int(path.split("train-")[1].split("-of")[0])
-                 mock_metadata = Mock()
-                 mock_metadata.num_rows = 1000
-                 mock_metadata.serialized_size = serialized_sizes[idx]
-                 mock_metadata.num_row_groups = 1
-
-                 # Mock row group with single column containing all compressed data
-                 mock_rg = Mock()
-                 mock_rg.num_columns = 1
-                 mock_col = Mock(total_compressed_size=compressed_sizes[idx])
-                 mock_rg.column = Mock(return_value=mock_col)
-                 mock_metadata.row_group = Mock(return_value=mock_rg)
-
-                 return mock_metadata
-
-             mock_read_metadata.side_effect = get_metadata_side_effect
-             # Mock the file size to return calculated size (compressed + footer + 8)
-             mock_get_size.side_effect = lambda path: (
-                 compressed_sizes[int(path.split("train-")[1].split("-of")[0])]
-                 + serialized_sizes[int(path.split("train-")[1].split("-of")[0])]
-                 + 8
-             )
-
-             enriched = metadata_writer._read_file_metadata(file_infos)
-
-             # Verify all files have correct sizes (compressed + footer + 8 bytes)
-             for i, file_info in enumerate(enriched):
-                 expected = compressed_sizes[i] + serialized_sizes[i] + 8
-                 assert file_info.size_bytes == expected
-                 # Verify we're not using just the footer
-                 assert file_info.size_bytes != serialized_sizes[i]
-                 # Verify the ratio is in the expected range
-                 ratio = file_info.size_bytes / serialized_sizes[i]
-                 assert 500 <= ratio <= 20000  # Based on real-world observations
-
-
- class TestGetPreviousManifests:
-     """Tests for the _get_previous_manifests method for fast append optimization."""
-
-     def test_no_snapshots_returns_none(self, metadata_writer):
-         """Test that None is returned when metadata has no snapshots."""
-         from pyiceberg.table.metadata import TableMetadataV2
-
-         # Create metadata with no snapshots
-         metadata = Mock(spec=TableMetadataV2)
-         metadata.current_snapshot_id = None
-         metadata.snapshots = []
-
-         result = metadata_writer._get_previous_manifests(metadata)
-         assert result is None
-
-     def test_returns_manifest_files_without_reading_contents(self, metadata_writer):
-         """Test that ManifestFile objects are returned without fetching their entries."""
-         from pyiceberg.manifest import ManifestFile
-         from pyiceberg.table.metadata import TableMetadataV2
-         from pyiceberg.table.snapshots import Snapshot
-
-         # Create mock manifest files
-         mock_manifest_1 = Mock(spec=ManifestFile)
-         mock_manifest_1.manifest_path = "hf://datasets/org/repo/metadata/manifest1.avro"
-
-         mock_manifest_2 = Mock(spec=ManifestFile)
-         mock_manifest_2.manifest_path = "hf://datasets/org/repo/metadata/manifest2.avro"
-
-         # Create mock snapshot
-         mock_snapshot = Mock(spec=Snapshot)
-         mock_snapshot.snapshot_id = 1
-         mock_snapshot.manifests.return_value = [mock_manifest_1, mock_manifest_2]
-
-         # Create metadata
-         metadata = Mock(spec=TableMetadataV2)
-         metadata.current_snapshot_id = 1
-         metadata.snapshots = [mock_snapshot]
-
-         # Test
-         result = metadata_writer._get_previous_manifests(metadata)
-
-         # Verify - should return manifest files
-         assert result is not None
-         assert len(result) == 2
-         assert result[0] == mock_manifest_1
-         assert result[1] == mock_manifest_2
-
-         # Critical: verify we did NOT call fetch_manifest_entry (no content reading)
-         assert (
-             not hasattr(mock_manifest_1, "fetch_manifest_entry")
-             or not mock_manifest_1.fetch_manifest_entry.called
-         )
-         assert (
-             not hasattr(mock_manifest_2, "fetch_manifest_entry")
-             or not mock_manifest_2.fetch_manifest_entry.called
-         )
-
-         # Verify we called manifests() with file_io
-         mock_snapshot.manifests.assert_called_once_with(metadata_writer.file_io)
@@ -1,175 +0,0 @@
- Metadata-Version: 2.4
- Name: faceberg
- Version: 0.1.0
- Summary: Bridge HuggingFace datasets with Apache Iceberg
- Project-URL: Homepage, https://github.com/kszucs/faceberg
- Project-URL: Documentation, https://github.com/kszucs/faceberg
- Project-URL: Repository, https://github.com/kszucs/faceberg
- Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
- License: Apache-2.0
- License-File: LICENSE
- Keywords: data-lake,datasets,huggingface,iceberg
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Requires-Python: >=3.9
- Requires-Dist: click>=8.0.0
- Requires-Dist: datasets>=2.0.0
- Requires-Dist: fsspec>=2023.1.0
- Requires-Dist: huggingface-hub>=0.20.0
- Requires-Dist: jinja2>=3.1.6
- Requires-Dist: litestar>=2.0.0
- Requires-Dist: pyarrow>=21.0.0
- Requires-Dist: pyiceberg>=0.6.0
- Requires-Dist: pyyaml>=6.0
- Requires-Dist: rich>=13.0.0
- Requires-Dist: uuid-utils>=0.9.0
- Requires-Dist: uvicorn[standard]>=0.27.0
- Provides-Extra: dev
- Requires-Dist: black>=23.0.0; extra == 'dev'
- Requires-Dist: duckdb>=0.10.0; extra == 'dev'
- Requires-Dist: mypy>=1.0.0; extra == 'dev'
- Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
- Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
- Requires-Dist: pytest>=7.0.0; extra == 'dev'
- Requires-Dist: requests>=2.31.0; extra == 'dev'
- Requires-Dist: ruff>=0.1.0; extra == 'dev'
- Description-Content-Type: text/markdown
-
- ![Faceberg](faceberg.png)
-
- # Faceberg
-
- Bridge HuggingFace datasets with Apache Iceberg tables.
-
- ## Installation
-
- ```bash
- pip install faceberg
- ```
-
- ## Quick Start
-
- ```bash
- # Create a catalog and add a dataset
- faceberg mycatalog init
- faceberg mycatalog add stanfordnlp/imdb --config plain_text
- faceberg mycatalog sync
-
- # Query the data
- faceberg mycatalog scan default.imdb --limit 5
- ```
-
- **Python API:**
-
- ```python
- from faceberg import catalog
-
- cat = catalog("mycatalog")
- table = cat.load_table("default.imdb")
- df = table.scan().to_pandas()
- print(df.head())
- ```
-
- **Documentation:**
- - [Getting Started](docs/index.qmd) - Quickstart guide
- - [Local Catalogs](docs/local.qmd) - Use local catalogs for testing
- - [DuckDB Integration](docs/integrations/duckdb.qmd) - Query with SQL
- - [Pandas Integration](docs/integrations/pandas.qmd) - Load into DataFrames
-
- ## How It Works
-
- Faceberg creates lightweight Iceberg metadata that points to original HuggingFace dataset files:
-
- ```
- HuggingFace Dataset         Your Catalog
- ┌─────────────────┐         ┌──────────────────┐
- │ org/dataset     │         │ mycatalog/       │
- │ ├── train.pq ◄──┼─────────┼─ default/        │
- │ └── test.pq  ◄──┼─────────┼─   └── imdb/     │
- └─────────────────┘         │       └── metadata/
-                             └──────────────────┘
- ```
-
- No data is copied—only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
-
- ## Usage
-
- ### CLI Commands
-
- ```bash
- # Initialize catalog
- faceberg mycatalog init
-
- # Add datasets
- faceberg mycatalog add openai/gsm8k --config main
-
- # Sync datasets (creates Iceberg metadata)
- faceberg mycatalog sync
-
- # List tables
- faceberg mycatalog list
-
- # Show table info
- faceberg mycatalog info default.gsm8k
-
- # Scan data
- faceberg mycatalog scan default.gsm8k --limit 10
-
- # Start REST server
- faceberg mycatalog serve --port 8181
- ```
-
- ### Remote Catalogs on HuggingFace Hub
-
- ```bash
- # Initialize remote catalog
- export HF_TOKEN=your_token
- faceberg org/catalog-repo init
-
- # Add and sync datasets
- faceberg org/catalog-repo add deepmind/code_contests --config default
- faceberg org/catalog-repo sync
-
- # Serve remote catalog
- faceberg org/catalog-repo serve
- ```
-
- ### Query with DuckDB
-
- ```python
- import duckdb
-
- conn = duckdb.connect()
- conn.execute("INSTALL httpfs; LOAD httpfs")
- conn.execute("INSTALL iceberg; LOAD iceberg")
-
- # Query local catalog
- result = conn.execute("""
-     SELECT * FROM iceberg_scan('mycatalog/default/imdb/metadata/v1.metadata.json')
-     LIMIT 10
- """).fetchall()
-
- # Query remote catalog
- result = conn.execute("""
-     SELECT * FROM iceberg_scan('hf://datasets/org/catalog/default/table/metadata/v1.metadata.json')
-     LIMIT 10
- """).fetchall()
- ```
-
- ## Development
-
- ```bash
- git clone https://github.com/kszucs/faceberg
- cd faceberg
- pip install -e .
- ```
-
- ## License
-
- Apache 2.0