faceberg 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg-0.1.2.dist-info/METADATA +149 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- faceberg-0.1.0.dist-info/METADATA +0 -175
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
faceberg/_version.py
ADDED
@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
+
+__commit_id__ = commit_id = None
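For reference, downstream code reads this generated module like any other attribute access; a minimal sketch using only the names defined above:

    from faceberg._version import __version__, version_tuple

    print(__version__)    # "0.1.2"
    print(version_tuple)  # (0, 1, 2)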
faceberg/catalog.py
CHANGED
@@ -4,7 +4,6 @@ import logging
 import os
 import shutil
 import tempfile
-import uuid
 from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Union
@@ -20,7 +19,7 @@ from pyiceberg.exceptions import (
     NoSuchTableError,
     TableAlreadyExistsError,
 )
-from pyiceberg.io import FileIO
+from pyiceberg.io import FileIO, load_file_io
 from pyiceberg.io.fsspec import FsspecFileIO
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionKey, PartitionSpec
 from pyiceberg.schema import Schema
@@ -34,8 +33,8 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
 from uuid_utils import uuid7
 
 from . import config as cfg
-from .
-from .
+from .discover import discover_dataset
+from .iceberg import write_snapshot
 
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -361,8 +360,6 @@ class BaseCatalog(Catalog):
         Returns:
            FileIO instance with authentication configured
        """
-        from pyiceberg.io import load_file_io
-
        # Start with catalog's persisted properties
        props = dict(self.properties)
        # Add runtime-only token if available
@@ -956,72 +953,82 @@ class BaseCatalog(Catalog):
                 identifier, state="in_progress", percent=0, stage="Discovering dataset"
             )
 
-            dataset_info =
+            dataset_info = discover_dataset(
                 repo_id=repo,
                 config=config,
                 token=self._hf_token,
             )
 
-            #
+            # Prepare schema with split column
             if progress_callback:
-                progress_callback(
+                progress_callback(
+                    identifier, state="in_progress", percent=10, stage="Converting schema"
+                )
+
+            if not dataset_info.files:
+                raise ValueError(f"No Parquet files found in dataset {repo}")
 
-            #
-
-
-
-
+            # Convert HuggingFace features to Arrow schema
+            arrow_schema = dataset_info.features.arrow_schema
+
+            # Build table properties
+            data_path = (
+                f"hf://datasets/{repo}/{dataset_info.data_dir}"
+                if dataset_info.data_dir
+                else f"hf://datasets/{repo}"
             )
 
-
+            properties = {
+                "format-version": "2",
+                "write.parquet.compression-codec": "snappy",
+                "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+                "write.data.path": data_path,
+                "hf.dataset.repo": repo,
+                "hf.dataset.config": config,
+                "hf.dataset.revision": dataset_info.revision,
+                "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+                "hf.write.split": "train",
+            }
+
+            # Write Iceberg metadata
             if progress_callback:
                 progress_callback(
-                    identifier, state="in_progress", percent=
+                    identifier, state="in_progress", percent=20, stage="Writing Iceberg metadata"
                 )
 
             with self._staging() as staging:
-                # Define table directory in the staging area
-                # Note: IcebergMetadataWriter will create the metadata subdirectory
-                table_dir = staging / identifier.path
-                table_dir.mkdir(parents=True, exist_ok=True)
-
                 # Create table URI for metadata
                 table_uri = self.uri / identifier.path
 
-                #
-
-                    table_path=table_dir,
-                    schema=table_info.schema,
-                    partition_spec=table_info.partition_spec,
-                    base_uri=table_uri,
-                )
+                # Load FileIO with HuggingFace support
+                io = self._load_file_io(location=str(table_uri))
 
-                #
-
-
-
-
-
-
-                    properties=
-
-
+                # Write snapshot metadata with split column
+                write_snapshot(
+                    files=dataset_info.files,
+                    schema=arrow_schema,
+                    current_metadata=None,
+                    output_dir=staging / identifier.path,
+                    base_uri=str(table_uri),
+                    properties=properties,
+                    include_split_column=True,
+                    io=io,
                 )
 
-                #
-                # Record all created files in the table directory
+                # Record all created files in the table metadata directory
                 if progress_callback:
                     progress_callback(identifier, state="in_progress", percent=90, stage="Finalizing")
 
-
+                metadata_dir = staging / identifier.path / "metadata"
+                for path in metadata_dir.rglob("*"):
                     if path.is_file():
                         staging.add(path.relative_to(staging.path))
 
                 # Register table in config if not already there
                 if identifier not in catalog_config:
                     catalog_config[identifier] = cfg.Dataset(
-                        repo=
-                        config=
+                        repo=repo,
+                        config=config,
                     )
                     # Save config since we added a dataset table
                     catalog_config.to_yaml(staging / "faceberg.yml")
@@ -1109,16 +1116,17 @@ class BaseCatalog(Catalog):
             "Please recreate the table to enable incremental sync."
         )
 
-        # Discover dataset at current revision
-
+        # Discover dataset at current revision
+        # Note: The new discover_dataset() doesn't support since_revision filtering yet
+        # So we discover all files and write_snapshot() will handle the diff
+        dataset_info = discover_dataset(
             repo_id=table_entry.repo,
             config=table_entry.config,
             token=self._hf_token,
-            since_revision=old_revision,
         )
 
-        # Check if already up to date (
-        if
+        # Check if already up to date (same revision)
+        if dataset_info.revision == old_revision:
             logger.info(f"Table {identifier} already at revision {old_revision}")
             if progress_callback:
                 progress_callback(
@@ -1126,43 +1134,51 @@ class BaseCatalog(Catalog):
                 )
             return table
 
-        #
-        #
-
-
-
+        # Use existing table schema - don't modify it
+        # The schema was already set correctly when the table was created
+
+        # Build updated properties
+        data_path = (
+            f"hf://datasets/{table_entry.repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{table_entry.repo}"
        )
 
-
-
-
-
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": table_entry.repo,
+            "hf.dataset.config": table_entry.config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }
 
-        # Append new snapshot with
+        # Append new snapshot with all files (write_snapshot will handle diffing)
         with self._staging() as staging:
-            # Create local metadata directory
-            metadata_dir = staging / identifier.path / "metadata"
-            metadata_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
-            table_uri = self.uri / identifier.path
-
-            # Create metadata writer
-            metadata_writer = IcebergMetadataWriter(
-                table_path=metadata_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            table_uri = self.uri / identifier.path
 
-            #
-
-
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))
+
+            # Write new snapshot (will diff against current_metadata)
+            # Schema and include_split_column parameters are ignored when current_metadata exists
+            # - it uses current_metadata.schema() and current_metadata.spec()
+            write_snapshot(
+                files=dataset_info.files,
+                schema=dataset_info.features.arrow_schema,  # Only used if creating new table
                current_metadata=table.metadata,
-
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                io=io,
            )
 
-            # Record all files in the
+            # Record all files in the metadata directory (including new manifest/metadata files)
+            metadata_dir = staging / identifier.path / "metadata"
            for path in metadata_dir.rglob("*"):
                if path.is_file():
                    staging.add(path.relative_to(staging.path))
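The hunks above replace the old IcebergMetadataWriter path with a two-step flow: discover_dataset() resolves the dataset's files, schema, and revision, and write_snapshot() (defined in the new faceberg/iceberg.py, whose body is not shown in this section) emits the Iceberg metadata. A minimal sketch of that flow, reconstructed from the call sites above; the repo id, config name, output directory, and the plain load_file_io() call standing in for the catalog's _load_file_io() helper are illustrative assumptions, not faceberg's actual wiring:

    from pathlib import Path

    from pyiceberg.io import load_file_io

    from faceberg.discover import discover_dataset
    from faceberg.iceberg import write_snapshot

    repo = "user/my-dataset"  # hypothetical dataset repo
    info = discover_dataset(repo_id=repo, config="default", token=None)

    properties = {
        "format-version": "2",
        "hf.dataset.repo": repo,
        "hf.dataset.config": "default",
        "hf.dataset.revision": info.revision,
    }

    # First snapshot: no current metadata yet, so the split column is added to the schema.
    write_snapshot(
        files=info.files,
        schema=info.features.arrow_schema,
        current_metadata=None,
        output_dir=Path("staging") / "my_table",
        base_uri=f"hf://datasets/{repo}",
        properties=properties,
        include_split_column=True,
        io=load_file_io(properties),  # the catalog builds an hf://-aware FileIO via _load_file_io()
    )

Subsequent syncs pass the table's existing metadata as current_metadata instead of None, and write_snapshot() diffs the newly discovered files against it, as the second set of hunks shows.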
faceberg/discover.py
ADDED
@@ -0,0 +1,181 @@
+"""HuggingFace dataset discovery.
+
+This module discovers HuggingFace datasets and extracts metadata without
+any Iceberg-specific conversions. It provides the foundation for converting
+datasets to Iceberg tables.
+"""
+
+import os
+import tempfile
+from dataclasses import dataclass
+from typing import List, Optional
+
+from datasets import Features, load_dataset_builder
+from huggingface_hub import HfApi
+
+
+def dataset_builder_safe(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+):
+    """Load dataset builder while avoiding picking up local files.
+
+    Changes to a temporary directory before loading to ensure the datasets
+    library doesn't pick up local files in the current directory.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID
+        config: Configuration name
+        token: Optional HuggingFace API token
+
+    Returns:
+        Dataset builder object
+
+    Raises:
+        Exception: If loading fails
+    """
+    original_cwd = os.getcwd()
+
+    try:
+        # Change to a temporary directory to avoid dataset library picking up local files
+        with tempfile.TemporaryDirectory() as tmpdir:
+            os.chdir(tmpdir)
+            return load_dataset_builder(repo_id, config, token=token)
+    finally:
+        # Always restore the original directory
+        os.chdir(original_cwd)
+
+
+@dataclass
+class ParquetFile:
+    """A Parquet file discovered in a HuggingFace dataset.
+
+    Attributes:
+        uri: Full hf:// URI with revision (e.g., hf://datasets/repo@sha/file.parquet)
+        path: File path within the dataset (e.g., data/train-00000.parquet)
+        size: File size in bytes
+        blob_id: Git blob ID (oid) from HuggingFace
+        split: Optional split name (train, test, validation, etc.)
+    """
+
+    uri: str
+    path: str
+    size: int
+    blob_id: str
+    split: Optional[str] = None
+
+
+@dataclass
+class DatasetInfo:
+    """Complete information about a discovered HuggingFace dataset.
+
+    This represents the result of dataset discovery, containing all metadata
+    needed to understand the dataset structure without any Iceberg conversions.
+
+    Attributes:
+        repo_id: HuggingFace repository ID (e.g., "squad")
+        config: Configuration name
+        revision: Git revision SHA
+        features: HuggingFace Features object describing the schema
+        splits: List of split names (e.g., ["train", "test"])
+        data_dir: Common directory path containing data files
+        files: List of all discovered Parquet files
+    """
+
+    repo_id: str
+    config: str
+    revision: str
+    features: Features
+    splits: List[str]
+    data_dir: str
+    files: List[ParquetFile]
+
+
+def discover_dataset(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+) -> DatasetInfo:
+    """Discover structure and files in a HuggingFace dataset.
+
+    Queries the HuggingFace Hub to gather dataset metadata, features, splits,
+    and Parquet file information without any Iceberg-specific conversions.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID (e.g., "squad")
+        config: Configuration name to discover
+        token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+    Returns:
+        DatasetInfo with all files for the latest revision
+
+    Raises:
+        ValueError: If dataset not found, config doesn't exist, or metadata inconsistent
+    """
+    # Step 1: Load dataset builder
+    try:
+        builder = dataset_builder_safe(repo_id, config=config, token=token)
+    except Exception as e:
+        raise ValueError(
+            f"Dataset {repo_id} config {config} not found or not accessible: {e}"
+        ) from e
+
+    revision = builder.hash
+    features = builder.info.features
+
+    # Step 2: Fetch file metadata from HuggingFace Hub
+    api = HfApi(token=token)
+    dataset_info = api.dataset_info(repo_id, revision=revision, files_metadata=True)
+    # Build mapping from URI to sibling metadata
+    file_metadata = {
+        f"hf://datasets/{repo_id}@{revision}/{s.rfilename}": s for s in dataset_info.siblings
+    }
+
+    # Step 3: Process data files
+    files = []
+    for split, file_uris in builder.config.data_files.items():
+        for uri in file_uris:
+            # Get metadata (strict - fail if not found)
+            if uri not in file_metadata:
+                raise ValueError(
+                    f"File {uri} from dataset builder not found in Hub API response. "
+                    f"This may indicate an inconsistent dataset state."
+                )
+
+            metadata = file_metadata[uri]
+
+            # Create ParquetFile
+            files.append(
+                ParquetFile(
+                    uri=uri,
+                    path=metadata.rfilename,
+                    size=metadata.size,
+                    blob_id=metadata.blob_id,
+                    split=split,
+                )
+            )
+
+    # Step 4: Extract common data directory
+    if files:
+        try:
+            file_dirs = [os.path.dirname(f.path) for f in files]
+            data_dir = os.path.commonpath(file_dirs) if file_dirs else ""
+        except ValueError as e:
+            file_paths = [f.path for f in files]
+            raise ValueError(
+                f"Unable to determine common data directory from files: {file_paths}"
+            ) from e
+    else:
+        data_dir = ""
+
+    # Step 5: Return DatasetInfo
+    return DatasetInfo(
+        repo_id=repo_id,
+        config=config,
+        revision=revision,
+        features=features,
+        splits=list(builder.config.data_files.keys()),
+        data_dir=data_dir,
+        files=files,
+    )
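A short usage sketch of the new discovery entry point; the attribute names come from the dataclasses above, while the dataset id and config are illustrative:

    from faceberg.discover import discover_dataset

    info = discover_dataset(repo_id="squad", config="plain_text")

    print(info.revision)   # pinned git revision SHA
    print(info.splits)     # e.g. ["train", "validation"]
    print(info.data_dir)   # common directory containing the Parquet files
    for f in info.files:
        print(f.split, f.size, f.uri)  # hf:// URIs resolved at that revision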