faceberg 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg-0.1.2/PKG-INFO +149 -0
- faceberg-0.1.2/README.md +106 -0
- faceberg-0.1.2/faceberg/_version.py +34 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/catalog.py +92 -76
- faceberg-0.1.2/faceberg/discover.py +181 -0
- faceberg-0.1.2/faceberg/iceberg.py +707 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog.py +1 -2
- faceberg-0.1.2/faceberg/tests/test_discover.py +257 -0
- faceberg-0.1.2/faceberg/tests/test_iceberg.py +911 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/pyproject.toml +9 -3
- faceberg-0.1.0/PKG-INFO +0 -175
- faceberg-0.1.0/README.md +0 -132
- faceberg-0.1.0/faceberg/bridge.py +0 -586
- faceberg-0.1.0/faceberg/convert.py +0 -813
- faceberg-0.1.0/faceberg/tests/test_bridge.py +0 -825
- faceberg-0.1.0/faceberg/tests/test_convert.py +0 -422
- {faceberg-0.1.0 → faceberg-0.1.2}/.gitignore +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/LICENSE +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/__init__.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/cli.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/config.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/pretty.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/server.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/shell.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/Dockerfile +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/README.md +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/landing.html +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/__init__.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/conftest.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog_duckdb.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog_pandas.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_cli.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_config.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_pretty.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_server.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_server_playwright.py +0 -0
faceberg-0.1.2/PKG-INFO
ADDED
@@ -0,0 +1,149 @@
Metadata-Version: 2.4
Name: faceberg
Version: 0.1.2
Summary: Bridge HuggingFace datasets with Apache Iceberg
Project-URL: Homepage, https://github.com/kszucs/faceberg
Project-URL: Documentation, https://github.com/kszucs/faceberg
Project-URL: Repository, https://github.com/kszucs/faceberg
Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
License: Apache-2.0
License-File: LICENSE
Keywords: data-lake,datasets,huggingface,iceberg
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.9
Requires-Dist: click>=8.0.0
Requires-Dist: datasets>=2.0.0
Requires-Dist: fsspec>=2023.1.0
Requires-Dist: huggingface-hub>=0.20.0
Requires-Dist: jinja2>=3.1.6
Requires-Dist: litestar>=2.0.0
Requires-Dist: pyarrow>=21.0.0
Requires-Dist: pyiceberg>=0.10.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0.0
Requires-Dist: uuid-utils>=0.9.0
Requires-Dist: uvicorn[standard]>=0.27.0
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: duckdb>=0.10.0; extra == 'dev'
Requires-Dist: mypy>=1.0.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: requests>=2.31.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Description-Content-Type: text/markdown


# Faceberg

**Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**

Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.

## Installation

```bash
pip install faceberg
```

## Quick Start

```bash
export HF_TOKEN=your_huggingface_token

# Create a catalog on HuggingFace Hub
faceberg user/mycatalog init

# Add datasets
faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
faceberg user/mycatalog add openai/gsm8k --config main

# Query with interactive DuckDB shell
faceberg user/mycatalog quack
```

```sql
SELECT label, substr(text, 1, 100) as preview
FROM iceberg_catalog.stanfordnlp.imdb
LIMIT 10;
```

## How It Works

```
                      HuggingFace Hub
┌──────────────────────────────────────────────────────────┐
│                                                          │
│  ┌─────────────────────┐    ┌─────────────────────────┐  │
│  │     HF Datasets     │    │   HF Spaces (Catalog)   │  │
│  │  (Original Parquet) │◄───│  • Iceberg metadata     │  │
│  │                     │    │  • REST API endpoint    │  │
│  │  stanfordnlp/imdb/  │    │  • faceberg.yml         │  │
│  │  └── *.parquet      │    │                         │  │
│  └─────────────────────┘    └───────────┬─────────────┘  │
│                                         │                │
└─────────────────────────────────────────┼────────────────┘
                                          │ Iceberg REST API
                                          ▼
                             ┌─────────────────────────┐
                             │      Query Engines      │
                             │  DuckDB, Pandas, Spark  │
                             └─────────────────────────┘
```

**No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.

## Python API

```python
import os
from faceberg import catalog

cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
table = cat.load_table("stanfordnlp.imdb")
df = table.scan(limit=100).to_pandas()
```

## Share Your Catalog

Your catalog is accessible to anyone via the REST API:

```python
import duckdb

conn = duckdb.connect()
conn.execute("INSTALL iceberg; LOAD iceberg")
conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")

result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
```

## Documentation

**[Read the docs →](https://faceberg.kszucs.dev/)**

- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames

## Development

```bash
git clone https://github.com/kszucs/faceberg
cd faceberg
pip install -e .
```

## License

Apache 2.0
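The long description above names PyIceberg among the supported clients but only demonstrates DuckDB. For reference, a minimal PyIceberg sketch of the same REST access pattern — the catalog name, Space URL, and table identifier are the illustrative ones from the README, not values shipped in the package:

```python
from pyiceberg.catalog import load_catalog

# Connect to the catalog's Iceberg REST endpoint (URL is illustrative).
catalog = load_catalog(
    "mycatalog",
    **{
        "type": "rest",
        "uri": "https://user-mycatalog.hf.space",
    },
)

# Load a table registered by `faceberg ... add` and read a few rows.
table = catalog.load_table("stanfordnlp.imdb")
print(table.scan(limit=5).to_arrow())
```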
faceberg-0.1.2/README.md
ADDED
@@ -0,0 +1,106 @@

# Faceberg

**Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**

Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.

## Installation

```bash
pip install faceberg
```

## Quick Start

```bash
export HF_TOKEN=your_huggingface_token

# Create a catalog on HuggingFace Hub
faceberg user/mycatalog init

# Add datasets
faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
faceberg user/mycatalog add openai/gsm8k --config main

# Query with interactive DuckDB shell
faceberg user/mycatalog quack
```

```sql
SELECT label, substr(text, 1, 100) as preview
FROM iceberg_catalog.stanfordnlp.imdb
LIMIT 10;
```

## How It Works

```
                      HuggingFace Hub
┌──────────────────────────────────────────────────────────┐
│                                                          │
│  ┌─────────────────────┐    ┌─────────────────────────┐  │
│  │     HF Datasets     │    │   HF Spaces (Catalog)   │  │
│  │  (Original Parquet) │◄───│  • Iceberg metadata     │  │
│  │                     │    │  • REST API endpoint    │  │
│  │  stanfordnlp/imdb/  │    │  • faceberg.yml         │  │
│  │  └── *.parquet      │    │                         │  │
│  └─────────────────────┘    └───────────┬─────────────┘  │
│                                         │                │
└─────────────────────────────────────────┼────────────────┘
                                          │ Iceberg REST API
                                          ▼
                             ┌─────────────────────────┐
                             │      Query Engines      │
                             │  DuckDB, Pandas, Spark  │
                             └─────────────────────────┘
```

**No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.

## Python API

```python
import os
from faceberg import catalog

cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
table = cat.load_table("stanfordnlp.imdb")
df = table.scan(limit=100).to_pandas()
```

## Share Your Catalog

Your catalog is accessible to anyone via the REST API:

```python
import duckdb

conn = duckdb.connect()
conn.execute("INSTALL iceberg; LOAD iceberg")
conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")

result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
```

## Documentation

**[Read the docs →](https://faceberg.kszucs.dev/)**

- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames

## Development

```bash
git clone https://github.com/kszucs/faceberg
cd faceberg
pip install -e .
```

## License

Apache 2.0
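The README's Python API example stops at a plain limited scan. Assuming the object returned by `cat.load_table()` behaves like a standard PyIceberg table (which the REST examples suggest), filtered and projected scans follow the usual PyIceberg scan API; the `label` and `text` column names below come from the imdb example above and this snippet is only a sketch:

```python
import os
from faceberg import catalog

cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
table = cat.load_table("stanfordnlp.imdb")

# Push a filter and a projection into the scan instead of materializing everything.
df = table.scan(
    row_filter="label = 1",
    selected_fields=("text", "label"),
    limit=100,
).to_pandas()
```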
faceberg-0.1.2/faceberg/_version.py
ADDED
@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.1.2'
__version_tuple__ = version_tuple = (0, 1, 2)

__commit_id__ = commit_id = None
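This is the standard module setuptools-scm writes at build time. Whether `faceberg/__init__.py` re-exports these attributes is not shown in this diff, so the sketch below imports the generated module directly:

```python
# Read the build-time version metadata written by setuptools-scm.
from faceberg._version import __version__, version_tuple

print(__version__)    # '0.1.2'
print(version_tuple)  # (0, 1, 2)
```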
{faceberg-0.1.0 → faceberg-0.1.2}/faceberg/catalog.py
@@ -4,7 +4,6 @@ import logging
 import os
 import shutil
 import tempfile
-import uuid
 from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Union
@@ -20,7 +19,7 @@ from pyiceberg.exceptions import (
     NoSuchTableError,
     TableAlreadyExistsError,
 )
-from pyiceberg.io import FileIO
+from pyiceberg.io import FileIO, load_file_io
 from pyiceberg.io.fsspec import FsspecFileIO
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionKey, PartitionSpec
 from pyiceberg.schema import Schema
@@ -34,8 +33,8 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
 from uuid_utils import uuid7

 from . import config as cfg
-from .
-from .
+from .discover import discover_dataset
+from .iceberg import write_snapshot

 if TYPE_CHECKING:
     import pyarrow as pa
@@ -361,8 +360,6 @@ class BaseCatalog(Catalog):
         Returns:
             FileIO instance with authentication configured
         """
-        from pyiceberg.io import load_file_io
-
         # Start with catalog's persisted properties
         props = dict(self.properties)
         # Add runtime-only token if available
@@ -956,72 +953,82 @@ class BaseCatalog(Catalog):
                 identifier, state="in_progress", percent=0, stage="Discovering dataset"
             )

-        dataset_info =
+        dataset_info = discover_dataset(
             repo_id=repo,
             config=config,
             token=self._hf_token,
         )

-        #
+        # Prepare schema with split column
         if progress_callback:
-            progress_callback(
+            progress_callback(
+                identifier, state="in_progress", percent=10, stage="Converting schema"
+            )
+
+        if not dataset_info.files:
+            raise ValueError(f"No Parquet files found in dataset {repo}")

-        #
-
-
-
-
+        # Convert HuggingFace features to Arrow schema
+        arrow_schema = dataset_info.features.arrow_schema
+
+        # Build table properties
+        data_path = (
+            f"hf://datasets/{repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{repo}"
         )

-
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": repo,
+            "hf.dataset.config": config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }
+
+        # Write Iceberg metadata
         if progress_callback:
             progress_callback(
-                identifier, state="in_progress", percent=
+                identifier, state="in_progress", percent=20, stage="Writing Iceberg metadata"
             )

         with self._staging() as staging:
-            # Define table directory in the staging area
-            # Note: IcebergMetadataWriter will create the metadata subdirectory
-            table_dir = staging / identifier.path
-            table_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
             table_uri = self.uri / identifier.path

-            #
-
-                table_path=table_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))

-            #
-
-
-
-
-
-
-                properties=
-
-
+            # Write snapshot metadata with split column
+            write_snapshot(
+                files=dataset_info.files,
+                schema=arrow_schema,
+                current_metadata=None,
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                include_split_column=True,
+                io=io,
             )

-            #
-            # Record all created files in the table directory
+            # Record all created files in the table metadata directory
             if progress_callback:
                 progress_callback(identifier, state="in_progress", percent=90, stage="Finalizing")

-
+            metadata_dir = staging / identifier.path / "metadata"
+            for path in metadata_dir.rglob("*"):
                 if path.is_file():
                     staging.add(path.relative_to(staging.path))

             # Register table in config if not already there
             if identifier not in catalog_config:
                 catalog_config[identifier] = cfg.Dataset(
-                    repo=
-                    config=
+                    repo=repo,
+                    config=config,
                 )
                 # Save config since we added a dataset table
                 catalog_config.to_yaml(staging / "faceberg.yml")
@@ -1109,16 +1116,17 @@ class BaseCatalog(Catalog):
             "Please recreate the table to enable incremental sync."
         )

-        # Discover dataset at current revision
-
+        # Discover dataset at current revision
+        # Note: The new discover_dataset() doesn't support since_revision filtering yet
+        # So we discover all files and write_snapshot() will handle the diff
+        dataset_info = discover_dataset(
            repo_id=table_entry.repo,
            config=table_entry.config,
            token=self._hf_token,
-           since_revision=old_revision,
        )

-        # Check if already up to date (
-        if
+        # Check if already up to date (same revision)
+        if dataset_info.revision == old_revision:
             logger.info(f"Table {identifier} already at revision {old_revision}")
             if progress_callback:
                 progress_callback(
@@ -1126,43 +1134,51 @@ class BaseCatalog(Catalog):
                 )
             return table

-        #
-        #
-
-
-
+        # Use existing table schema - don't modify it
+        # The schema was already set correctly when the table was created
+
+        # Build updated properties
+        data_path = (
+            f"hf://datasets/{table_entry.repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{table_entry.repo}"
        )

-
-
-
-
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": table_entry.repo,
+            "hf.dataset.config": table_entry.config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }

-        # Append new snapshot with
+        # Append new snapshot with all files (write_snapshot will handle diffing)
         with self._staging() as staging:
-            # Create local metadata directory
-            metadata_dir = staging / identifier.path / "metadata"
-            metadata_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
-            table_uri = self.uri / identifier.path
-
-            # Create metadata writer
-            metadata_writer = IcebergMetadataWriter(
-                table_path=metadata_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            table_uri = self.uri / identifier.path

-            #
-
-
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))
+
+            # Write new snapshot (will diff against current_metadata)
+            # Schema and include_split_column parameters are ignored when current_metadata exists
+            # - it uses current_metadata.schema() and current_metadata.spec()
+            write_snapshot(
+                files=dataset_info.files,
+                schema=dataset_info.features.arrow_schema,  # Only used if creating new table
                 current_metadata=table.metadata,
-
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                io=io,
             )

-            # Record all files in the
+            # Record all files in the metadata directory (including new manifest/metadata files)
+            metadata_dir = staging / identifier.path / "metadata"
             for path in metadata_dir.rglob("*"):
                 if path.is_file():
                     staging.add(path.relative_to(staging.path))