faceberg 0.1.0__tar.gz → 0.1.2__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (36)
  1. faceberg-0.1.2/PKG-INFO +149 -0
  2. faceberg-0.1.2/README.md +106 -0
  3. faceberg-0.1.2/faceberg/_version.py +34 -0
  4. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/catalog.py +92 -76
  5. faceberg-0.1.2/faceberg/discover.py +181 -0
  6. faceberg-0.1.2/faceberg/iceberg.py +707 -0
  7. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog.py +1 -2
  8. faceberg-0.1.2/faceberg/tests/test_discover.py +257 -0
  9. faceberg-0.1.2/faceberg/tests/test_iceberg.py +911 -0
  10. {faceberg-0.1.0 → faceberg-0.1.2}/pyproject.toml +9 -3
  11. faceberg-0.1.0/PKG-INFO +0 -175
  12. faceberg-0.1.0/README.md +0 -132
  13. faceberg-0.1.0/faceberg/bridge.py +0 -586
  14. faceberg-0.1.0/faceberg/convert.py +0 -813
  15. faceberg-0.1.0/faceberg/tests/test_bridge.py +0 -825
  16. faceberg-0.1.0/faceberg/tests/test_convert.py +0 -422
  17. {faceberg-0.1.0 → faceberg-0.1.2}/.gitignore +0 -0
  18. {faceberg-0.1.0 → faceberg-0.1.2}/LICENSE +0 -0
  19. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/__init__.py +0 -0
  20. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/cli.py +0 -0
  21. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/config.py +0 -0
  22. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/pretty.py +0 -0
  23. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/server.py +0 -0
  24. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/shell.py +0 -0
  25. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/Dockerfile +0 -0
  26. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/README.md +0 -0
  27. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/spaces/landing.html +0 -0
  28. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/__init__.py +0 -0
  29. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/conftest.py +0 -0
  30. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog_duckdb.py +0 -0
  31. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_catalog_pandas.py +0 -0
  32. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_cli.py +0 -0
  33. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_config.py +0 -0
  34. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_pretty.py +0 -0
  35. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_server.py +0 -0
  36. {faceberg-0.1.0 → faceberg-0.1.2}/faceberg/tests/test_server_playwright.py +0 -0
faceberg-0.1.2/PKG-INFO
@@ -0,0 +1,149 @@
+ Metadata-Version: 2.4
+ Name: faceberg
+ Version: 0.1.2
+ Summary: Bridge HuggingFace datasets with Apache Iceberg
+ Project-URL: Homepage, https://github.com/kszucs/faceberg
+ Project-URL: Documentation, https://github.com/kszucs/faceberg
+ Project-URL: Repository, https://github.com/kszucs/faceberg
+ Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
+ License: Apache-2.0
+ License-File: LICENSE
+ Keywords: data-lake,datasets,huggingface,iceberg
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.9
+ Requires-Dist: click>=8.0.0
+ Requires-Dist: datasets>=2.0.0
+ Requires-Dist: fsspec>=2023.1.0
+ Requires-Dist: huggingface-hub>=0.20.0
+ Requires-Dist: jinja2>=3.1.6
+ Requires-Dist: litestar>=2.0.0
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: pyiceberg>=0.10.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: rich>=13.0.0
+ Requires-Dist: uuid-utils>=0.9.0
+ Requires-Dist: uvicorn[standard]>=0.27.0
+ Provides-Extra: dev
+ Requires-Dist: black>=23.0.0; extra == 'dev'
+ Requires-Dist: duckdb>=0.10.0; extra == 'dev'
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+ Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
+ Requires-Dist: requests>=2.31.0; extra == 'dev'
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ ![Faceberg](https://github.com/kszucs/faceberg/blob/main/faceberg.png?raw=true)
+
+ # Faceberg
+
+ **Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**
+
+ Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.
+
+ ## Installation
+
+ ```bash
+ pip install faceberg
+ ```
+
+ ## Quick Start
+
+ ```bash
+ export HF_TOKEN=your_huggingface_token
+
+ # Create a catalog on HuggingFace Hub
+ faceberg user/mycatalog init
+
+ # Add datasets
+ faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
+ faceberg user/mycatalog add openai/gsm8k --config main
+
+ # Query with interactive DuckDB shell
+ faceberg user/mycatalog quack
+ ```
+
+ ```sql
+ SELECT label, substr(text, 1, 100) as preview
+ FROM iceberg_catalog.stanfordnlp.imdb
+ LIMIT 10;
+ ```
+
+ ## How It Works
+
+ ```
+ HuggingFace Hub
+ ┌─────────────────────────────────────────────────────────┐
+ │ │
+ │ ┌─────────────────────┐ ┌─────────────────────────┐ │
+ │ │ HF Datasets │ │ HF Spaces (Catalog) │ │
+ │ │ (Original Parquet) │◄───│ • Iceberg metadata │ │
+ │ │ │ │ • REST API endpoint │ │
+ │ │ stanfordnlp/imdb/ │ │ • faceberg.yml │ │
+ │ │ └── *.parquet │ │ │ │
+ │ └─────────────────────┘ └───────────┬─────────────┘ │
+ │ │ │
+ └─────────────────────────────────────────┼───────────────┘
+ │ Iceberg REST API
+
+ ┌─────────────────────────┐
+ │ Query Engines │
+ │ DuckDB, Pandas, Spark │
+ └─────────────────────────┘
+ ```
+
+ **No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
+
+ ## Python API
+
+ ```python
+ import os
+ from faceberg import catalog
+
+ cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
+ table = cat.load_table("stanfordnlp.imdb")
+ df = table.scan(limit=100).to_pandas()
+ ```
+
+ ## Share Your Catalog
+
+ Your catalog is accessible to anyone via the REST API:
+
+ ```python
+ import duckdb
+
+ conn = duckdb.connect()
+ conn.execute("INSTALL iceberg; LOAD iceberg")
+ conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")
+
+ result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
+ ```
+
+ ## Documentation
+
+ **[Read the docs →](https://faceberg.kszucs.dev/)**
+
+ - [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
+ - [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
+ - [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
+ - [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames
+
+ ## Development
+
+ ```bash
+ git clone https://github.com/kszucs/faceberg
+ cd faceberg
+ pip install -e .
+ ```
+
+ ## License
+
+ Apache 2.0
faceberg-0.1.2/README.md
@@ -0,0 +1,106 @@
+ ![Faceberg](https://github.com/kszucs/faceberg/blob/main/faceberg.png?raw=true)
+
+ # Faceberg
+
+ **Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**
+
+ Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.
+
+ ## Installation
+
+ ```bash
+ pip install faceberg
+ ```
+
+ ## Quick Start
+
+ ```bash
+ export HF_TOKEN=your_huggingface_token
+
+ # Create a catalog on HuggingFace Hub
+ faceberg user/mycatalog init
+
+ # Add datasets
+ faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
+ faceberg user/mycatalog add openai/gsm8k --config main
+
+ # Query with interactive DuckDB shell
+ faceberg user/mycatalog quack
+ ```
+
+ ```sql
+ SELECT label, substr(text, 1, 100) as preview
+ FROM iceberg_catalog.stanfordnlp.imdb
+ LIMIT 10;
+ ```
+
+ ## How It Works
+
+ ```
+ HuggingFace Hub
+ ┌─────────────────────────────────────────────────────────┐
+ │ │
+ │ ┌─────────────────────┐ ┌─────────────────────────┐ │
+ │ │ HF Datasets │ │ HF Spaces (Catalog) │ │
+ │ │ (Original Parquet) │◄───│ • Iceberg metadata │ │
+ │ │ │ │ • REST API endpoint │ │
+ │ │ stanfordnlp/imdb/ │ │ • faceberg.yml │ │
+ │ │ └── *.parquet │ │ │ │
+ │ └─────────────────────┘ └───────────┬─────────────┘ │
+ │ │ │
+ └─────────────────────────────────────────┼───────────────┘
+ │ Iceberg REST API
+
+ ┌─────────────────────────┐
+ │ Query Engines │
+ │ DuckDB, Pandas, Spark │
+ └─────────────────────────┘
+ ```
+
+ **No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
+
+ ## Python API
+
+ ```python
+ import os
+ from faceberg import catalog
+
+ cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
+ table = cat.load_table("stanfordnlp.imdb")
+ df = table.scan(limit=100).to_pandas()
+ ```
+
+ ## Share Your Catalog
+
+ Your catalog is accessible to anyone via the REST API:
+
+ ```python
+ import duckdb
+
+ conn = duckdb.connect()
+ conn.execute("INSTALL iceberg; LOAD iceberg")
+ conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")
+
+ result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
+ ```
+
+ ## Documentation
+
+ **[Read the docs →](https://faceberg.kszucs.dev/)**
+
+ - [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
+ - [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
+ - [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
+ - [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames
+
+ ## Development
+
+ ```bash
+ git clone https://github.com/kszucs/faceberg
+ cd faceberg
+ pip install -e .
+ ```
+
+ ## License
+
+ Apache 2.0
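
The README shows DuckDB attaching to the Space's REST endpoint and faceberg's own `catalog()` helper, and lists PyIceberg among the compatible engines. A minimal sketch of reaching the same catalog from a plain PyIceberg client, assuming the Space URL serves a standard Iceberg REST catalog and the `stanfordnlp.imdb` table from the Quick Start exists:

```python
from pyiceberg.catalog import load_catalog

# Assumption: the HF Space exposes a standard Iceberg REST catalog at its root URL.
catalog = load_catalog("cat", type="rest", uri="https://user-mycatalog.hf.space")

table = catalog.load_table("stanfordnlp.imdb")
df = table.scan(limit=5).to_pandas()  # same scan API the README uses
print(df)
```
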
faceberg-0.1.2/faceberg/_version.py
@@ -0,0 +1,34 @@
+ # file generated by setuptools-scm
+ # don't change, don't track in version control
+
+ __all__ = [
+     "__version__",
+     "__version_tuple__",
+     "version",
+     "version_tuple",
+     "__commit_id__",
+     "commit_id",
+ ]
+
+ TYPE_CHECKING = False
+ if TYPE_CHECKING:
+     from typing import Tuple
+     from typing import Union
+
+     VERSION_TUPLE = Tuple[Union[int, str], ...]
+     COMMIT_ID = Union[str, None]
+ else:
+     VERSION_TUPLE = object
+     COMMIT_ID = object
+
+ version: str
+ __version__: str
+ __version_tuple__: VERSION_TUPLE
+ version_tuple: VERSION_TUPLE
+ commit_id: COMMIT_ID
+ __commit_id__: COMMIT_ID
+
+ __version__ = version = '0.1.2'
+ __version_tuple__ = version_tuple = (0, 1, 2)
+
+ __commit_id__ = commit_id = None
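
This is the standard setuptools-scm version stub, so consuming it is just an import. A small sketch, assuming the module is importable directly as shown above (the top-level `faceberg` package may or may not re-export it):

```python
from faceberg._version import __version__, __version_tuple__

print(__version__)        # '0.1.2'
print(__version_tuple__)  # (0, 1, 2)
```
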
{faceberg-0.1.0 → faceberg-0.1.2}/faceberg/catalog.py
@@ -4,7 +4,6 @@ import logging
  import os
  import shutil
  import tempfile
- import uuid
  from contextlib import contextmanager
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Union
@@ -20,7 +19,7 @@ from pyiceberg.exceptions import (
  NoSuchTableError,
  TableAlreadyExistsError,
  )
- from pyiceberg.io import FileIO
+ from pyiceberg.io import FileIO, load_file_io
  from pyiceberg.io.fsspec import FsspecFileIO
  from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionKey, PartitionSpec
  from pyiceberg.schema import Schema
@@ -34,8 +33,8 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
  from uuid_utils import uuid7

  from . import config as cfg
- from .bridge import DatasetInfo
- from .convert import IcebergMetadataWriter
+ from .discover import discover_dataset
+ from .iceberg import write_snapshot

  if TYPE_CHECKING:
      import pyarrow as pa
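
The import swap above replaces the 0.1.0 `bridge`/`convert` modules with the new `discover`/`iceberg` modules. A minimal sketch of the discovery call, using only the keyword arguments and attributes that appear in the hunks below (passing `token=None` for a public dataset is an assumption):

```python
from faceberg.discover import discover_dataset

# repo/config values mirror the README's Quick Start; token=None is assumed to work for public datasets.
info = discover_dataset(repo_id="stanfordnlp/imdb", config="plain_text", token=None)

print(info.revision)               # dataset revision the snapshot will pin
print(len(info.files))             # discovered Parquet files
print(info.features.arrow_schema)  # Arrow schema derived from the HF features
print(info.data_dir)               # optional data directory within the repo
```
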
@@ -361,8 +360,6 @@ class BaseCatalog(Catalog):
  Returns:
  FileIO instance with authentication configured
  """
- from pyiceberg.io import load_file_io
-
  # Start with catalog's persisted properties
  props = dict(self.properties)
  # Add runtime-only token if available
@@ -956,72 +953,82 @@ class BaseCatalog(Catalog):
  identifier, state="in_progress", percent=0, stage="Discovering dataset"
  )

- dataset_info = DatasetInfo.discover(
+ dataset_info = discover_dataset(
  repo_id=repo,
  config=config,
  token=self._hf_token,
  )

- # Convert to TableInfo
+ # Prepare schema with split column
  if progress_callback:
- progress_callback(identifier, state="in_progress", percent=0, stage="Converting schema")
+ progress_callback(
+ identifier, state="in_progress", percent=10, stage="Converting schema"
+ )
+
+ if not dataset_info.files:
+ raise ValueError(f"No Parquet files found in dataset {repo}")

- # TODO(kszucs): support nested namespace, pass identifier to to_table_info
- namespace, table_name = identifier
- table_info = dataset_info.to_table_info(
- namespace=namespace,
- table_name=table_name,
+ # Convert HuggingFace features to Arrow schema
+ arrow_schema = dataset_info.features.arrow_schema
+
+ # Build table properties
+ data_path = (
+ f"hf://datasets/{repo}/{dataset_info.data_dir}"
+ if dataset_info.data_dir
+ else f"hf://datasets/{repo}"
  )

- # Create the table with full metadata in staging context
+ properties = {
+ "format-version": "2",
+ "write.parquet.compression-codec": "snappy",
+ "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+ "write.data.path": data_path,
+ "hf.dataset.repo": repo,
+ "hf.dataset.config": config,
+ "hf.dataset.revision": dataset_info.revision,
+ "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+ "hf.write.split": "train",
+ }
+
+ # Write Iceberg metadata
  if progress_callback:
  progress_callback(
- identifier, state="in_progress", percent=0, stage="Writing Iceberg metadata"
+ identifier, state="in_progress", percent=20, stage="Writing Iceberg metadata"
  )

  with self._staging() as staging:
- # Define table directory in the staging area
- # Note: IcebergMetadataWriter will create the metadata subdirectory
- table_dir = staging / identifier.path
- table_dir.mkdir(parents=True, exist_ok=True)
-
  # Create table URI for metadata
  table_uri = self.uri / identifier.path

- # Create metadata writer
- metadata_writer = IcebergMetadataWriter(
- table_path=table_dir,
- schema=table_info.schema,
- partition_spec=table_info.partition_spec,
- base_uri=table_uri,
- )
+ # Load FileIO with HuggingFace support
+ io = self._load_file_io(location=str(table_uri))

- # Generate table UUID
- table_uuid = str(uuid.uuid4())
-
- # Write Iceberg metadata files (manifest, manifest list, table metadata)
- metadata_writer.create_metadata_from_files(
- file_infos=table_info.data_files,
- table_uuid=table_uuid,
- properties=table_info.get_table_properties(),
- progress_callback=progress_callback,
- identifier=identifier,
+ # Write snapshot metadata with split column
+ write_snapshot(
+ files=dataset_info.files,
+ schema=arrow_schema,
+ current_metadata=None,
+ output_dir=staging / identifier.path,
+ base_uri=str(table_uri),
+ properties=properties,
+ include_split_column=True,
+ io=io,
  )

- # TODO(kszucs): metadata writer should return with the affected file paths
- # Record all created files in the table directory
+ # Record all created files in the table metadata directory
  if progress_callback:
  progress_callback(identifier, state="in_progress", percent=90, stage="Finalizing")

- for path in table_dir.rglob("*"):
+ metadata_dir = staging / identifier.path / "metadata"
+ for path in metadata_dir.rglob("*"):
  if path.is_file():
  staging.add(path.relative_to(staging.path))

  # Register table in config if not already there
  if identifier not in catalog_config:
  catalog_config[identifier] = cfg.Dataset(
- repo=table_info.dataset_repo,
- config=table_info.dataset_config,
+ repo=repo,
+ config=config,
  )
  # Save config since we added a dataset table
  catalog_config.to_yaml(staging / "faceberg.yml")
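
To make the `data_path` construction in the hunk above concrete, here is the same expression evaluated with illustrative inputs (the `data_dir` value is hypothetical; real values come from `discover_dataset()`):

```python
# Hypothetical inputs for illustration only.
repo = "stanfordnlp/imdb"
data_dir = "plain_text"

data_path = f"hf://datasets/{repo}/{data_dir}" if data_dir else f"hf://datasets/{repo}"
print(data_path)  # hf://datasets/stanfordnlp/imdb/plain_text
```
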
@@ -1109,16 +1116,17 @@ class BaseCatalog(Catalog):
  "Please recreate the table to enable incremental sync."
  )

- # Discover dataset at current revision with only new files since old_revision
- dataset_info = DatasetInfo.discover(
+ # Discover dataset at current revision
+ # Note: The new discover_dataset() doesn't support since_revision filtering yet
+ # So we discover all files and write_snapshot() will handle the diff
+ dataset_info = discover_dataset(
  repo_id=table_entry.repo,
  config=table_entry.config,
  token=self._hf_token,
- since_revision=old_revision,
  )

- # Check if already up to date (no new files)
- if not dataset_info.data_files:
+ # Check if already up to date (same revision)
+ if dataset_info.revision == old_revision:
  logger.info(f"Table {identifier} already at revision {old_revision}")
  if progress_callback:
  progress_callback(
@@ -1126,43 +1134,51 @@ class BaseCatalog(Catalog):
  )
  return table

- # Convert to TableInfo with only new files
- # TODO(kszucs): support nested namespace, pass identifier to to_table_info
- table_info = dataset_info.to_table_info(
- namespace=identifier[0],
- table_name=identifier[1],
+ # Use existing table schema - don't modify it
+ # The schema was already set correctly when the table was created
+
+ # Build updated properties
+ data_path = (
+ f"hf://datasets/{table_entry.repo}/{dataset_info.data_dir}"
+ if dataset_info.data_dir
+ else f"hf://datasets/{table_entry.repo}"
  )

- # If no new files, table is already up to date
- if not table_info.data_files:
- logger.info(f"No new files for {identifier}")
- return table
+ properties = {
+ "format-version": "2",
+ "write.parquet.compression-codec": "snappy",
+ "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+ "write.data.path": data_path,
+ "hf.dataset.repo": table_entry.repo,
+ "hf.dataset.config": table_entry.config,
+ "hf.dataset.revision": dataset_info.revision,
+ "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+ "hf.write.split": "train",
+ }

- # Append new snapshot with only new files
+ # Append new snapshot with all files (write_snapshot will handle diffing)
  with self._staging() as staging:
- # Create local metadata directory
- metadata_dir = staging / identifier.path / "metadata"
- metadata_dir.mkdir(parents=True, exist_ok=True)
-
  # Create table URI for metadata
- table_uri = self.uri / identifier.path.path
-
- # Create metadata writer
- metadata_writer = IcebergMetadataWriter(
- table_path=metadata_dir,
- schema=table_info.schema,
- partition_spec=table_info.partition_spec,
- base_uri=table_uri,
- )
+ table_uri = self.uri / identifier.path

- # Append new snapshot with updated files
- metadata_writer.append_snapshot_from_files(
- file_infos=table_info.data_files,
+ # Load FileIO with HuggingFace support
+ io = self._load_file_io(location=str(table_uri))
+
+ # Write new snapshot (will diff against current_metadata)
+ # Schema and include_split_column parameters are ignored when current_metadata exists
+ # - it uses current_metadata.schema() and current_metadata.spec()
+ write_snapshot(
+ files=dataset_info.files,
+ schema=dataset_info.features.arrow_schema,  # Only used if creating new table
  current_metadata=table.metadata,
- properties=table_info.get_table_properties(),
+ output_dir=staging / identifier.path,
+ base_uri=str(table_uri),
+ properties=properties,
+ io=io,
  )

- # Record all files in the table directory (including new manifest/metadata files)
+ # Record all files in the metadata directory (including new manifest/metadata files)
+ metadata_dir = staging / identifier.path / "metadata"
  for path in metadata_dir.rglob("*"):
  if path.is_file():
  staging.add(path.relative_to(staging.path))