faceberg 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {faceberg-0.1.1 → faceberg-0.1.3}/PKG-INFO +9 -7
  2. {faceberg-0.1.1 → faceberg-0.1.3}/README.md +7 -5
  3. faceberg-0.1.3/faceberg/_version.py +34 -0
  4. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/catalog.py +92 -76
  5. faceberg-0.1.3/faceberg/discover.py +181 -0
  6. faceberg-0.1.3/faceberg/iceberg.py +707 -0
  7. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_catalog.py +1 -2
  8. faceberg-0.1.3/faceberg/tests/test_discover.py +257 -0
  9. faceberg-0.1.3/faceberg/tests/test_iceberg.py +911 -0
  10. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_server_playwright.py +5 -1
  11. {faceberg-0.1.1 → faceberg-0.1.3}/pyproject.toml +10 -3
  12. faceberg-0.1.1/faceberg/bridge.py +0 -586
  13. faceberg-0.1.1/faceberg/convert.py +0 -813
  14. faceberg-0.1.1/faceberg/tests/test_bridge.py +0 -825
  15. faceberg-0.1.1/faceberg/tests/test_convert.py +0 -422
  16. {faceberg-0.1.1 → faceberg-0.1.3}/.gitignore +0 -0
  17. {faceberg-0.1.1 → faceberg-0.1.3}/LICENSE +0 -0
  18. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/__init__.py +0 -0
  19. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/cli.py +0 -0
  20. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/config.py +0 -0
  21. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/pretty.py +0 -0
  22. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/server.py +0 -0
  23. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/shell.py +0 -0
  24. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/spaces/Dockerfile +0 -0
  25. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/spaces/README.md +0 -0
  26. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/spaces/landing.html +0 -0
  27. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/__init__.py +0 -0
  28. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/conftest.py +0 -0
  29. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_catalog_duckdb.py +0 -0
  30. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_catalog_pandas.py +0 -0
  31. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_cli.py +0 -0
  32. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_config.py +0 -0
  33. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_pretty.py +0 -0
  34. {faceberg-0.1.1 → faceberg-0.1.3}/faceberg/tests/test_server.py +0 -0
{faceberg-0.1.1 → faceberg-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: faceberg
-Version: 0.1.1
+Version: 0.1.3
 Summary: Bridge HuggingFace datasets with Apache Iceberg
 Project-URL: Homepage, https://github.com/kszucs/faceberg
 Project-URL: Documentation, https://github.com/kszucs/faceberg
@@ -25,7 +25,7 @@ Requires-Dist: huggingface-hub>=0.20.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: litestar>=2.0.0
 Requires-Dist: pyarrow>=21.0.0
-Requires-Dist: pyiceberg>=0.6.0
+Requires-Dist: pyiceberg>=0.10.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: uuid-utils>=0.9.0
@@ -82,7 +82,7 @@ LIMIT 10;
 ```
 HuggingFace Hub
 ┌─────────────────────────────────────────────────────────┐
-
+
 │ ┌─────────────────────┐ ┌─────────────────────────┐ │
 │ │ HF Datasets │ │ HF Spaces (Catalog) │ │
 │ │ (Original Parquet) │◄───│ • Iceberg metadata │ │
@@ -129,10 +129,12 @@ result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
 
 ## Documentation
 
-- [Getting Started](docs/index.qmd) — Full quickstart guide
-- [Local Catalogs](docs/local.qmd) — Use local catalogs for development
-- [DuckDB Integration](docs/integrations/duckdb.qmd) — Advanced SQL queries
-- [Pandas Integration](docs/integrations/pandas.qmd) — Load into DataFrames
+**[Read the docs →](https://faceberg.kszucs.dev/)**
+
+- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
+- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
+- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
+- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames
 
 ## Development
 
{faceberg-0.1.1 → faceberg-0.1.3}/README.md
@@ -39,7 +39,7 @@ LIMIT 10;
 ```
 HuggingFace Hub
 ┌─────────────────────────────────────────────────────────┐
-
+
 │ ┌─────────────────────┐ ┌─────────────────────────┐ │
 │ │ HF Datasets │ │ HF Spaces (Catalog) │ │
 │ │ (Original Parquet) │◄───│ • Iceberg metadata │ │
@@ -86,10 +86,12 @@ result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
 
 ## Documentation
 
-- [Getting Started](docs/index.qmd) — Full quickstart guide
-- [Local Catalogs](docs/local.qmd) — Use local catalogs for development
-- [DuckDB Integration](docs/integrations/duckdb.qmd) — Advanced SQL queries
-- [Pandas Integration](docs/integrations/pandas.qmd) — Load into DataFrames
+**[Read the docs →](https://faceberg.kszucs.dev/)**
+
+- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
+- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
+- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
+- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames
 
 ## Development
 
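The README hunks above reference querying the catalog through DuckDB. A minimal, illustrative sketch of that query follows, assuming a DuckDB connection that already has the faceberg catalog attached as `cat`; the attachment step itself is covered by the linked docs and is not reproduced here.

```python
import duckdb

conn = duckdb.connect()
# ... attach the faceberg catalog as `cat` here, following the linked docs ...

# The query from the README hunk header: fetch five IMDB rows into a pandas DataFrame.
result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
print(result)
```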
faceberg-0.1.3/faceberg/_version.py
@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '0.1.3'
+__version_tuple__ = version_tuple = (0, 1, 3)
+
+__commit_id__ = commit_id = None
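The generated module simply exposes the release metadata as importable attributes; a quick usage sketch based only on the names defined above:

```python
# Read the installed faceberg version from the setuptools-scm generated module.
from faceberg._version import __commit_id__, __version__, __version_tuple__

print(__version__)        # '0.1.3'
print(__version_tuple__)  # (0, 1, 3)
print(__commit_id__)      # None for this release build
```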
{faceberg-0.1.1 → faceberg-0.1.3}/faceberg/catalog.py
@@ -4,7 +4,6 @@ import logging
 import os
 import shutil
 import tempfile
-import uuid
 from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Union
@@ -20,7 +19,7 @@ from pyiceberg.exceptions import (
     NoSuchTableError,
     TableAlreadyExistsError,
 )
-from pyiceberg.io import FileIO
+from pyiceberg.io import FileIO, load_file_io
 from pyiceberg.io.fsspec import FsspecFileIO
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionKey, PartitionSpec
 from pyiceberg.schema import Schema
@@ -34,8 +33,8 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
 from uuid_utils import uuid7
 
 from . import config as cfg
-from .bridge import DatasetInfo
-from .convert import IcebergMetadataWriter
+from .discover import discover_dataset
+from .iceberg import write_snapshot
 
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -361,8 +360,6 @@ class BaseCatalog(Catalog):
         Returns:
             FileIO instance with authentication configured
         """
-        from pyiceberg.io import load_file_io
-
         # Start with catalog's persisted properties
         props = dict(self.properties)
         # Add runtime-only token if available
@@ -956,72 +953,82 @@ class BaseCatalog(Catalog):
                 identifier, state="in_progress", percent=0, stage="Discovering dataset"
             )
 
-        dataset_info = DatasetInfo.discover(
+        dataset_info = discover_dataset(
             repo_id=repo,
             config=config,
             token=self._hf_token,
         )
 
-        # Convert to TableInfo
+        # Prepare schema with split column
         if progress_callback:
-            progress_callback(identifier, state="in_progress", percent=0, stage="Converting schema")
+            progress_callback(
+                identifier, state="in_progress", percent=10, stage="Converting schema"
+            )
+
+        if not dataset_info.files:
+            raise ValueError(f"No Parquet files found in dataset {repo}")
 
-        # TODO(kszucs): support nested namespace, pass identifier to to_table_info
-        namespace, table_name = identifier
-        table_info = dataset_info.to_table_info(
-            namespace=namespace,
-            table_name=table_name,
+        # Convert HuggingFace features to Arrow schema
+        arrow_schema = dataset_info.features.arrow_schema
+
+        # Build table properties
+        data_path = (
+            f"hf://datasets/{repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{repo}"
         )
 
-        # Create the table with full metadata in staging context
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": repo,
+            "hf.dataset.config": config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }
+
+        # Write Iceberg metadata
         if progress_callback:
             progress_callback(
-                identifier, state="in_progress", percent=0, stage="Writing Iceberg metadata"
+                identifier, state="in_progress", percent=20, stage="Writing Iceberg metadata"
            )
 
         with self._staging() as staging:
-            # Define table directory in the staging area
-            # Note: IcebergMetadataWriter will create the metadata subdirectory
-            table_dir = staging / identifier.path
-            table_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
             table_uri = self.uri / identifier.path
 
-            # Create metadata writer
-            metadata_writer = IcebergMetadataWriter(
-                table_path=table_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))
 
-            # Generate table UUID
-            table_uuid = str(uuid.uuid4())
-
-            # Write Iceberg metadata files (manifest, manifest list, table metadata)
-            metadata_writer.create_metadata_from_files(
-                file_infos=table_info.data_files,
-                table_uuid=table_uuid,
-                properties=table_info.get_table_properties(),
-                progress_callback=progress_callback,
-                identifier=identifier,
+            # Write snapshot metadata with split column
+            write_snapshot(
+                files=dataset_info.files,
+                schema=arrow_schema,
+                current_metadata=None,
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                include_split_column=True,
+                io=io,
             )
 
-            # TODO(kszucs): metadata writer should return with the affected file paths
-            # Record all created files in the table directory
+            # Record all created files in the table metadata directory
            if progress_callback:
                 progress_callback(identifier, state="in_progress", percent=90, stage="Finalizing")
 
-            for path in table_dir.rglob("*"):
+            metadata_dir = staging / identifier.path / "metadata"
+            for path in metadata_dir.rglob("*"):
                 if path.is_file():
                     staging.add(path.relative_to(staging.path))
 
             # Register table in config if not already there
             if identifier not in catalog_config:
                 catalog_config[identifier] = cfg.Dataset(
-                    repo=table_info.dataset_repo,
-                    config=table_info.dataset_config,
+                    repo=repo,
+                    config=config,
                 )
             # Save config since we added a dataset table
             catalog_config.to_yaml(staging / "faceberg.yml")
@@ -1109,16 +1116,17 @@
                 "Please recreate the table to enable incremental sync."
             )
 
-        # Discover dataset at current revision with only new files since old_revision
-        dataset_info = DatasetInfo.discover(
+        # Discover dataset at current revision
+        # Note: The new discover_dataset() doesn't support since_revision filtering yet
+        # So we discover all files and write_snapshot() will handle the diff
+        dataset_info = discover_dataset(
             repo_id=table_entry.repo,
             config=table_entry.config,
             token=self._hf_token,
-            since_revision=old_revision,
         )
 
-        # Check if already up to date (no new files)
-        if not dataset_info.data_files:
+        # Check if already up to date (same revision)
+        if dataset_info.revision == old_revision:
             logger.info(f"Table {identifier} already at revision {old_revision}")
             if progress_callback:
                 progress_callback(
@@ -1126,43 +1134,51 @@
                 )
             return table
 
-        # Convert to TableInfo with only new files
-        # TODO(kszucs): support nested namespace, pass identifier to to_table_info
-        table_info = dataset_info.to_table_info(
-            namespace=identifier[0],
-            table_name=identifier[1],
+        # Use existing table schema - don't modify it
+        # The schema was already set correctly when the table was created
+
+        # Build updated properties
+        data_path = (
+            f"hf://datasets/{table_entry.repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{table_entry.repo}"
        )
 
-        # If no new files, table is already up to date
-        if not table_info.data_files:
-            logger.info(f"No new files for {identifier}")
-            return table
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": table_entry.repo,
+            "hf.dataset.config": table_entry.config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }
 
-        # Append new snapshot with only new files
+        # Append new snapshot with all files (write_snapshot will handle diffing)
         with self._staging() as staging:
-            # Create local metadata directory
-            metadata_dir = staging / identifier.path / "metadata"
-            metadata_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
-            table_uri = self.uri / identifier.path.path
-
-            # Create metadata writer
-            metadata_writer = IcebergMetadataWriter(
-                table_path=metadata_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            table_uri = self.uri / identifier.path
 
-            # Append new snapshot with updated files
-            metadata_writer.append_snapshot_from_files(
-                file_infos=table_info.data_files,
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))
+
+            # Write new snapshot (will diff against current_metadata)
+            # Schema and include_split_column parameters are ignored when current_metadata exists
+            # - it uses current_metadata.schema() and current_metadata.spec()
+            write_snapshot(
+                files=dataset_info.files,
+                schema=dataset_info.features.arrow_schema,  # Only used if creating new table
                 current_metadata=table.metadata,
-                properties=table_info.get_table_properties(),
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                io=io,
            )
 
-            # Record all files in the table directory (including new manifest/metadata files)
+            # Record all files in the metadata directory (including new manifest/metadata files)
+            metadata_dir = staging / identifier.path / "metadata"
             for path in metadata_dir.rglob("*"):
                 if path.is_file():
                     staging.add(path.relative_to(staging.path))
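Taken together, the catalog.py hunks replace the old DatasetInfo.discover/IcebergMetadataWriter pipeline with discover_dataset() plus write_snapshot(). A condensed sketch of the new create path follows; the keyword arguments mirror the call sites in the hunks above, while the repo/config values, local output directory, and plain FileIO are illustrative stand-ins for what the catalog wires up itself (HfLocationProvider, hf:// URIs, staging).

```python
from pathlib import Path

from pyiceberg.io import load_file_io

from faceberg.discover import discover_dataset
from faceberg.iceberg import write_snapshot

# Discover the dataset on the Hub (repo/config values are illustrative).
info = discover_dataset(repo_id="stanfordnlp/imdb", config="plain_text")

output_dir = Path("/tmp/imdb-table")  # stand-in for the staging table directory
base_uri = output_dir.as_uri()        # stand-in for the catalog's hf:// table URI

# Create a brand-new table: no current metadata, split column included.
write_snapshot(
    files=info.files,                    # ParquetFile entries from discovery
    schema=info.features.arrow_schema,   # HuggingFace Features -> Arrow schema
    current_metadata=None,               # None => first snapshot of a new table
    output_dir=output_dir,
    base_uri=base_uri,
    properties={
        "format-version": "2",
        "hf.dataset.repo": info.repo_id,
        "hf.dataset.config": info.config,
        "hf.dataset.revision": info.revision,
    },
    include_split_column=True,           # expose the HF split in the table schema
    io=load_file_io(location=base_uri),  # the catalog uses self._load_file_io() instead
)
```

For incremental sync, the second set of hunks passes the existing table.metadata as current_metadata instead, and write_snapshot diffs the newly discovered files against it.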
faceberg-0.1.3/faceberg/discover.py
@@ -0,0 +1,181 @@
+"""HuggingFace dataset discovery.
+
+This module discovers HuggingFace datasets and extracts metadata without
+any Iceberg-specific conversions. It provides the foundation for converting
+datasets to Iceberg tables.
+"""
+
+import os
+import tempfile
+from dataclasses import dataclass
+from typing import List, Optional
+
+from datasets import Features, load_dataset_builder
+from huggingface_hub import HfApi
+
+
+def dataset_builder_safe(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+):
+    """Load dataset builder while avoiding picking up local files.
+
+    Changes to a temporary directory before loading to ensure the datasets
+    library doesn't pick up local files in the current directory.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID
+        config: Configuration name
+        token: Optional HuggingFace API token
+
+    Returns:
+        Dataset builder object
+
+    Raises:
+        Exception: If loading fails
+    """
+    original_cwd = os.getcwd()
+
+    try:
+        # Change to a temporary directory to avoid dataset library picking up local files
+        with tempfile.TemporaryDirectory() as tmpdir:
+            os.chdir(tmpdir)
+            return load_dataset_builder(repo_id, config, token=token)
+    finally:
+        # Always restore the original directory
+        os.chdir(original_cwd)
+
+
+@dataclass
+class ParquetFile:
+    """A Parquet file discovered in a HuggingFace dataset.
+
+    Attributes:
+        uri: Full hf:// URI with revision (e.g., hf://datasets/repo@sha/file.parquet)
+        path: File path within the dataset (e.g., data/train-00000.parquet)
+        size: File size in bytes
+        blob_id: Git blob ID (oid) from HuggingFace
+        split: Optional split name (train, test, validation, etc.)
+    """
+
+    uri: str
+    path: str
+    size: int
+    blob_id: str
+    split: Optional[str] = None
+
+
+@dataclass
+class DatasetInfo:
+    """Complete information about a discovered HuggingFace dataset.
+
+    This represents the result of dataset discovery, containing all metadata
+    needed to understand the dataset structure without any Iceberg conversions.
+
+    Attributes:
+        repo_id: HuggingFace repository ID (e.g., "squad")
+        config: Configuration name
+        revision: Git revision SHA
+        features: HuggingFace Features object describing the schema
+        splits: List of split names (e.g., ["train", "test"])
+        data_dir: Common directory path containing data files
+        files: List of all discovered Parquet files
+    """
+
+    repo_id: str
+    config: str
+    revision: str
+    features: Features
+    splits: List[str]
+    data_dir: str
+    files: List[ParquetFile]
+
+
+def discover_dataset(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+) -> DatasetInfo:
+    """Discover structure and files in a HuggingFace dataset.
+
+    Queries the HuggingFace Hub to gather dataset metadata, features, splits,
+    and Parquet file information without any Iceberg-specific conversions.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID (e.g., "squad")
+        config: Configuration name to discover
+        token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+    Returns:
+        DatasetInfo with all files for the latest revision
+
+    Raises:
+        ValueError: If dataset not found, config doesn't exist, or metadata inconsistent
+    """
+    # Step 1: Load dataset builder
+    try:
+        builder = dataset_builder_safe(repo_id, config=config, token=token)
+    except Exception as e:
+        raise ValueError(
+            f"Dataset {repo_id} config {config} not found or not accessible: {e}"
+        ) from e
+
+    revision = builder.hash
+    features = builder.info.features
+
+    # Step 2: Fetch file metadata from HuggingFace Hub
+    api = HfApi(token=token)
+    dataset_info = api.dataset_info(repo_id, revision=revision, files_metadata=True)
+    # Build mapping from URI to sibling metadata
+    file_metadata = {
+        f"hf://datasets/{repo_id}@{revision}/{s.rfilename}": s for s in dataset_info.siblings
+    }
+
+    # Step 3: Process data files
+    files = []
+    for split, file_uris in builder.config.data_files.items():
+        for uri in file_uris:
+            # Get metadata (strict - fail if not found)
+            if uri not in file_metadata:
+                raise ValueError(
+                    f"File {uri} from dataset builder not found in Hub API response. "
+                    f"This may indicate an inconsistent dataset state."
+                )
+
+            metadata = file_metadata[uri]
+
+            # Create ParquetFile
+            files.append(
+                ParquetFile(
+                    uri=uri,
+                    path=metadata.rfilename,
+                    size=metadata.size,
+                    blob_id=metadata.blob_id,
+                    split=split,
+                )
+            )
+
+    # Step 4: Extract common data directory
+    if files:
+        try:
+            file_dirs = [os.path.dirname(f.path) for f in files]
+            data_dir = os.path.commonpath(file_dirs) if file_dirs else ""
+        except ValueError as e:
+            file_paths = [f.path for f in files]
+            raise ValueError(
+                f"Unable to determine common data directory from files: {file_paths}"
+            ) from e
+    else:
+        data_dir = ""
+
+    # Step 5: Return DatasetInfo
+    return DatasetInfo(
+        repo_id=repo_id,
+        config=config,
+        revision=revision,
+        features=features,
+        splits=list(builder.config.data_files.keys()),
+        data_dir=data_dir,
+        files=files,
+    )
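On its own, the new module gives a small, Iceberg-free discovery API; a minimal usage sketch (the repo and config names are only illustrative):

```python
from faceberg.discover import discover_dataset

# Inspect the Parquet layout of a Hub dataset without any Iceberg machinery.
info = discover_dataset(repo_id="stanfordnlp/imdb", config="plain_text")

print(info.revision)               # git revision SHA the discovery is pinned to
print(info.splits)                 # split names reported by the dataset builder
print(info.data_dir)               # common directory containing the Parquet files
for f in info.files:
    print(f.split, f.size, f.uri)  # per-file split, size in bytes, revisioned hf:// URI
```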