faceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg/tests/test_server_playwright.py +5 -1
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/METADATA +9 -7
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/RECORD +13 -12
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/WHEEL +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/licenses/LICENSE +0 -0
faceberg/tests/test_server_playwright.py

@@ -4,7 +4,11 @@ import re
 import time

 import pytest
-
+
+try:
+    from playwright.sync_api import Page, expect
+except ImportError:
+    playwright = pytest.importorskip("playwright", reason="Playwright not installed")


 @pytest.fixture(scope="session")
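For context, the change above swaps a hard Playwright dependency for an optional one. A minimal standalone sketch of the new module-level guard (the comments are added here and are not part of the package):

```python
import pytest

try:
    # Imported for the browser tests that follow in the real test module.
    from playwright.sync_api import Page, expect
except ImportError:
    # pytest.importorskip() raised at import time marks the whole test module
    # as skipped during collection instead of failing with an ImportError.
    playwright = pytest.importorskip("playwright", reason="Playwright not installed")
```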
{faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: faceberg
-Version: 0.1.1
+Version: 0.1.3
 Summary: Bridge HuggingFace datasets with Apache Iceberg
 Project-URL: Homepage, https://github.com/kszucs/faceberg
 Project-URL: Documentation, https://github.com/kszucs/faceberg

@@ -25,7 +25,7 @@ Requires-Dist: huggingface-hub>=0.20.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: litestar>=2.0.0
 Requires-Dist: pyarrow>=21.0.0
-Requires-Dist: pyiceberg>=0.
+Requires-Dist: pyiceberg>=0.10.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: uuid-utils>=0.9.0

@@ -82,7 +82,7 @@ LIMIT 10;
 ```
                      HuggingFace Hub
 ┌─────────────────────────────────────────────────────────┐
-│
+│                                                         │
 │  ┌─────────────────────┐    ┌─────────────────────────┐ │
 │  │   HF Datasets       │    │  HF Spaces (Catalog)    │ │
 │  │  (Original Parquet) │◄───│  • Iceberg metadata     │ │

@@ -129,10 +129,12 @@ result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()

 ## Documentation

-
-
-- [
-- [
+**[Read the docs →](https://faceberg.kszucs.dev/)**
+
+- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
+- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
+- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
+- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames

 ## Development

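METADATA now requires `pyiceberg>=0.10.0`. A small check (not part of the package) that an environment satisfies the new floor, assuming the `packaging` library is installed:

```python
from importlib.metadata import version

from packaging.version import Version

# Fails loudly if the installed pyiceberg is older than the floor declared
# in faceberg 0.1.3's METADATA (Requires-Dist: pyiceberg>=0.10.0).
assert Version(version("pyiceberg")) >= Version("0.10.0")
```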
{faceberg-0.1.1.dist-info → faceberg-0.1.3.dist-info}/RECORD

@@ -1,9 +1,10 @@
 faceberg/__init__.py,sha256=F3fztSclzDN7_ItHopPIChWb4YwnEs5DKZ_ckslTebY,354
-faceberg/
-faceberg/catalog.py,sha256=
+faceberg/_version.py,sha256=q5nF98G8SoVeJqaknL0xdyxtv0egsqb0fK06_84Izu8,704
+faceberg/catalog.py,sha256=Jr_tOrPn7967VJvSRUIHLVjdwVAdOyFre6mldyfhxvk,55283
 faceberg/cli.py,sha256=BNwv8kZ__3c8vLSOPvWLKSsrxvNsMNDfPlkMyDHLsLk,17123
 faceberg/config.py,sha256=SjEfipfT38trjgmTA1abpY--5AOnYlt_QM8KihDUxJY,5912
-faceberg/
+faceberg/discover.py,sha256=HyENObguao6bwF1eFmLd1-eo8FT9_FdaVZ3-8EQ_0XU,5575
+faceberg/iceberg.py,sha256=_ZLQGnlCKmHoVEYVpKOMVnjufja7o-2PqJc3JI116BY,27520
 faceberg/pretty.py,sha256=PUmQbv0HJDU5Q-psR4gTT1ON62NoIYWzS2JJy5_o-pY,6806
 faceberg/server.py,sha256=tc_ULXyQy-5KEtLVyETjecNQajoaFGCz_aw_-rzzyOY,15369
 faceberg/shell.py,sha256=wa5r06VhrAq0I4gc3Qrl8HvSY9we0wBFxtPp6QPL2dk,2320

@@ -12,18 +13,18 @@ faceberg/spaces/README.md,sha256=mM7M7_MKI_673DCR1HpRVvb6sYK6ohtDYos5ykCNY2s,258
 faceberg/spaces/landing.html,sha256=I1Oadpg58VTuWbQBSuqqA1-g1dZOKP5vCXWrxxrEV5k,27333
 faceberg/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 faceberg/tests/conftest.py,sha256=rwvBIlMDhD1USNKrt65Qn5PYefX0TTyBtIPDTtaetYA,7547
-faceberg/tests/
-faceberg/tests/test_catalog.py,sha256=CBhFnyASQTH1sqFH_Z_RFD74tzmzi7lvXQaSKnXA1eI,49948
+faceberg/tests/test_catalog.py,sha256=YFrSOR4ZNZ3WifFFmLDnD5evJXN2PC-4uUnVzc2b22A,49939
 faceberg/tests/test_catalog_duckdb.py,sha256=9-nz40sbM7xmYPHa78Y5_Ag6hrpu6o--zCyUc9Fqu4w,10392
 faceberg/tests/test_catalog_pandas.py,sha256=jOnRb_pzsTUcgaQJsm49zbw2zaxUFRvjGq-Ub_q-vBs,9854
 faceberg/tests/test_cli.py,sha256=CTsF9SPSpU5PHdBGOQ-GAZjXNbgPy4TdNDJj0Gm6uAM,1828
 faceberg/tests/test_config.py,sha256=gg7xaOIhTCpPryw8SpcYgPJSoEWvy3CKRp3cEg0EF-4,12040
-faceberg/tests/
+faceberg/tests/test_discover.py,sha256=0xDjf_E-SNdd2ysvj8L-xTN2eFm31tRBFPxk04lfDCQ,9076
+faceberg/tests/test_iceberg.py,sha256=t_nQhQ_Jkpj_NGIE3vf4Uhzu7rJ8eqbKRoiUEXzTp4g,31112
 faceberg/tests/test_pretty.py,sha256=o70FhUTdqWtXQI6AKVuOxtPghFvD5OV8UnG_64wSqnY,11520
 faceberg/tests/test_server.py,sha256=MWwRCloR3R-YWOzn95eNAv8cR3gGBg9dk65skz5eLdY,13405
-faceberg/tests/test_server_playwright.py,sha256
-faceberg-0.1.
-faceberg-0.1.
-faceberg-0.1.
-faceberg-0.1.
-faceberg-0.1.
+faceberg/tests/test_server_playwright.py,sha256=BZJBNaeaLQI6NQSaydEXwS7soop7OR8YwFYRJKrrXkE,17671
+faceberg-0.1.3.dist-info/METADATA,sha256=d37CM7x6aKYUd3RCR3wU9wYb919XFABFBrAWOVHfbRE,5532
+faceberg-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+faceberg-0.1.3.dist-info/entry_points.txt,sha256=FPgDHoQRBWU0Wp_i2s4_T6dOqzKovH17g4O4oYRtKcI,47
+faceberg-0.1.3.dist-info/licenses/LICENSE,sha256=DLb11Qr5b1cU8I9DJ9Sl9vNU3m_yqyMnmKjR15tNbt0,11345
+faceberg-0.1.3.dist-info/RECORD,,
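Each RECORD entry above is the standard wheel triple `path,sha256=<digest>,size`, where the digest is the unpadded URL-safe base64 of the file's SHA-256. A minimal sketch (not part of the package) for recomputing an entry from an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe base64 digest, no padding>,size."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"


# Run from the root of an unpacked faceberg-0.1.3 wheel, this should reproduce
# the faceberg/catalog.py line shown above.
print(record_entry("faceberg/catalog.py"))
```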
faceberg/bridge.py
DELETED

@@ -1,586 +0,0 @@
"""Bridge between HuggingFace datasets and Apache Iceberg tables.

This module discovers HuggingFace datasets and converts them to TableInfo objects
that contain all the Iceberg metadata needed for table creation.
"""

import json
import os
import tempfile
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from datasets import (
    Features,
    load_dataset_builder,
)
from huggingface_hub import HfApi, HfFileSystem
from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema, assign_fresh_schema_ids
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import ListType, MapType, NestedField, StringType, StructType

# =============================================================================
# Bridge Output Classes
# =============================================================================


@dataclass
class FileInfo:
    """Information about a data file in Iceberg table."""

    uri: str  # Full hf:// URI to the file
    split: Optional[str] = None  # Split name (train, test, validation, etc.)
    size_bytes: Optional[int] = None  # File size in bytes (enriched later)
    row_count: Optional[int] = None  # Number of rows in the file (enriched later)


@dataclass
class TableInfo:
    """Complete information needed to create an Iceberg table.

    This class serves as the output of the bridge layer, containing all the
    metadata needed to convert a HuggingFace dataset into an Iceberg table.
    """

    # Table identity
    namespace: str  # Iceberg namespace (e.g., "default")
    table_name: str  # Table name (e.g., "squad_plain_text")

    # Iceberg schema and partitioning
    schema: Schema  # Iceberg schema with field IDs
    partition_spec: PartitionSpec  # Partition specification

    # Data files
    data_files: List[FileInfo]  # List of data files with metadata
    data_dir: str  # Data directory path relative to repo root

    # Source metadata (for traceability)
    dataset_repo: str  # HuggingFace repo ID
    dataset_config: str  # Dataset configuration name
    dataset_revision: str  # Git revision/SHA of the dataset

    @property
    def identifier(self) -> str:
        """Get table identifier in 'namespace.table_name' format."""
        return f"{self.namespace}.{self.table_name}"

    @property
    def total_rows(self) -> int:
        """Get total row count across all files."""
        return sum(f.row_count for f in self.data_files if f.row_count is not None)

    @property
    def total_size(self) -> int:
        """Get total size in bytes across all files."""
        return sum(f.size_bytes for f in self.data_files if f.size_bytes is not None)

    def get_table_properties(self) -> Dict[str, str]:
        """Get table properties for Iceberg metadata.

        Returns:
            Dictionary of table properties including source metadata and name mapping
        """
        # Create schema name mapping for Parquet files without embedded field IDs
        name_mapping = iceberg_name_mapping(self.schema)

        # Use data directory from discovery
        data_path = (
            f"hf://datasets/{self.dataset_repo}/{self.data_dir}"
            if self.data_dir
            else f"hf://datasets/{self.dataset_repo}"
        )

        # TODO(kszucs): split should be configurable
        properties = {
            "format-version": "3",
            "write.parquet.compression-codec": "snappy",
            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
            "write.data.path": data_path,
            # HuggingFace source metadata
            "hf.dataset.repo": self.dataset_repo,
            "hf.dataset.config": self.dataset_config,
            "hf.dataset.revision": self.dataset_revision,
            # Write configuration
            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
            "hf.write.split": "train",
            # Schema mapping
            "schema.name-mapping.default": json.dumps(name_mapping),
        }

        return properties


# =============================================================================
# Iceberg Helpers (Schema and Metadata)
# =============================================================================


def iceberg_field_mapping(field: NestedField) -> Dict[str, any]:
    """Build name mapping for a single field, recursively handling nested types.

    Args:
        field: Iceberg NestedField to create mapping for

    Returns:
        Dictionary containing field-id, names, and optionally nested fields
    """
    mapping = {
        "field-id": field.field_id,
        "names": [field.name],
    }

    # Handle nested types
    if isinstance(field.field_type, StructType):
        # Recursively map nested struct fields
        nested_fields = []
        for nested_field in field.field_type.fields:
            nested_fields.append(iceberg_field_mapping(nested_field))
        if nested_fields:
            mapping["fields"] = nested_fields
    elif isinstance(field.field_type, ListType):
        # Create mapping for the list element
        element_mapping = {
            "field-id": field.field_type.element_id,
            "names": ["element"],
        }
        # If element is a struct, recursively map its fields
        if isinstance(field.field_type.element_type, StructType):
            element_fields = []
            for nested_field in field.field_type.element_type.fields:
                element_fields.append(iceberg_field_mapping(nested_field))
            if element_fields:
                element_mapping["fields"] = element_fields
        mapping["fields"] = [element_mapping]
    elif isinstance(field.field_type, MapType):
        # Create mappings for key and value
        map_fields = []

        # Map the key
        key_mapping = {
            "field-id": field.field_type.key_id,
            "names": ["key"],
        }
        if isinstance(field.field_type.key_type, StructType):
            key_fields = []
            for nested_field in field.field_type.key_type.fields:
                key_fields.append(iceberg_field_mapping(nested_field))
            if key_fields:
                key_mapping["fields"] = key_fields
        map_fields.append(key_mapping)

        # Map the value
        value_mapping = {
            "field-id": field.field_type.value_id,
            "names": ["value"],
        }
        if isinstance(field.field_type.value_type, StructType):
            value_fields = []
            for nested_field in field.field_type.value_type.fields:
                value_fields.append(iceberg_field_mapping(nested_field))
            if value_fields:
                value_mapping["fields"] = value_fields
        map_fields.append(value_mapping)

        mapping["fields"] = map_fields

    return mapping


def iceberg_name_mapping(schema: Schema) -> List[Dict[str, any]]:
    """Build Iceberg name mapping from schema, recursively handling nested fields.

    Name mapping is used to map Parquet column names to Iceberg field IDs for
    files that don't have embedded field IDs.

    Args:
        schema: Iceberg schema to create mapping for

    Returns:
        List of field mappings with field-id, names, and nested fields
    """
    fields = []
    for field in schema.fields:
        fields.append(iceberg_field_mapping(field))
    return fields


def iceberg_partition_spec(schema: Schema) -> PartitionSpec:
    """Build a partition spec that uses 'split' as a partition key.

    This creates an identity partition on the split column, which means the split
    value will be stored in metadata and used for partition pruning.

    Args:
        schema: Iceberg schema containing a 'split' field

    Returns:
        PartitionSpec with split as partition key

    Raises:
        ValueError: If schema doesn't contain a 'split' field
    """
    split_field = schema.find_field("split")
    if split_field is None:
        raise ValueError("Schema must contain a 'split' field to create split partition spec")

    return PartitionSpec(
        PartitionField(
            source_id=split_field.field_id,
            field_id=1000,  # Partition field IDs start at 1000
            transform=IdentityTransform(),
            name="split",
        ),
        spec_id=0,
    )


def iceberg_schema_from_features(features, include_split_column: bool = True) -> Schema:
    """
    Build an Iceberg Schema from HuggingFace dataset features using Arrow as an intermediate format.

    This approach ensures globally unique field IDs across nested structures by leveraging
    PyIceberg's built-in conversion and ID assignment logic.

    Args:
        features: HuggingFace Features object or dict of features
        include_split_column: If True, adds a 'split' column to the schema (default: True)

    Returns:
        PyIceberg Schema object with globally unique field IDs
    """
    # Convert to Features if dict
    if isinstance(features, dict):
        features = Features(features)

    # Convert: Features → Arrow Schema → Iceberg Schema (without IDs) → Assign fresh IDs
    # This ensures globally unique field IDs across all nested structures
    arrow_schema = features.arrow_schema
    iceberg_schema_no_ids = _pyarrow_to_schema_without_ids(arrow_schema)
    schema = assign_fresh_schema_ids(iceberg_schema_no_ids)

    # Add split column as the first field if requested
    if include_split_column:
        # Create split field (will get ID 1 after reassignment)
        # Note: Although the schema uses StringType, the actual Parquet data
        # will use dictionary encoding (int8 indices) for compression efficiency
        # The split column is optional since it doesn't exist in the source Parquet files,
        # it's derived from partition metadata
        split_field = NestedField(
            field_id=-1,  # Temporary ID, will be reassigned
            name="split",
            field_type=StringType(),
            required=False,
        )
        # Prepend split field to existing fields
        new_fields = [split_field] + list(schema.fields)

        # Create new schema and reassign all field IDs globally
        # This ensures field IDs are globally unique across nested structures
        schema_with_split = Schema(*new_fields)
        schema = assign_fresh_schema_ids(schema_with_split)

    return schema


# =============================================================================
# Dataset Helpers (HuggingFace)
# =============================================================================


def dataset_new_files(
    repo_id: str,
    config: str,
    old_revision: str,
    new_revision: str,
    token: Optional[str] = None,
) -> List[str]:
    """Find new parquet files added between two revisions.

    Uses HuggingFace Hub API to diff two git revisions and identify
    new parquet files for a specific dataset configuration.

    Args:
        repo_id: HuggingFace dataset repo ID (e.g., "squad")
        config: Dataset configuration name (e.g., "plain_text")
        old_revision: Previous commit SHA
        new_revision: Current commit SHA or branch (usually "main")
        token: HuggingFace API token

    Returns:
        List of path_in_repo strings for new parquet files

    Example:
        >>> dataset_new_files(
        ...     "squad",
        ...     "plain_text",
        ...     "abc123",
        ...     "def456"
        ... )
        ['plain_text/train-00000-of-00001.parquet', 'plain_text/validation-00000-of-00001.parquet']
    """
    api = HfApi(token=token)

    # Get all files at old revision
    old_files = set(
        api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=old_revision,
        )
    )

    # Get all files at new revision
    new_files = set(
        api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=new_revision,
        )
    )

    # Find added files (set difference)
    added_files = new_files - old_files

    # Filter for parquet files in this config
    config_prefix = f"{config}/"
    relative_paths = sorted(
        f for f in added_files if f.endswith(".parquet") and f.startswith(config_prefix)
    )

    return relative_paths


def dataset_builder_safe(
    repo_id: str,
    config: str,
    token: Optional[str] = None,
):
    """Load dataset builder while avoiding picking up local files.

    Changes to a temporary directory before loading to ensure the datasets
    library doesn't pick up local files in the current directory.

    Args:
        repo_id: HuggingFace dataset repository ID
        config_name: Optional configuration name
        token: Optional HuggingFace API token

    Returns:
        Dataset builder object

    Raises:
        Exception: If loading fails
    """
    original_cwd = os.getcwd()

    try:
        # Change to a temporary directory to avoid dataset library picking up local files
        with tempfile.TemporaryDirectory() as tmpdir:
            os.chdir(tmpdir)
            return load_dataset_builder(repo_id, config, token=token)
    finally:
        # Always restore the original directory
        os.chdir(original_cwd)


def dataset_data_files(
    data_files: Dict[str, List[str]], filter_paths: Optional[List[str]] = None
) -> Tuple[Dict[str, List[str]], str]:
    """Filter data files and extract data directory.

    Optionally filters data files by path_in_repo and extracts the common directory path.

    Args:
        data_files: Dictionary mapping splits to lists of file URIs (with revision)
        filter_paths: Optional list of path_in_repo strings to filter by

    Returns:
        Tuple of (filtered_data_files, data_dir)

    Example:
        >>> dataset_data_files(
        ...     {"train": ["hf://datasets/repo@rev/plain_text/train-00000.parquet"]},
        ...     filter_paths=["plain_text/train-00000.parquet"]
        ... )
        ({'train': ['hf://datasets/repo@rev/plain_text/train-00000.parquet']}, 'plain_text')
    """
    fs = HfFileSystem()

    # Convert filter_paths to set for fast lookup
    filter_set = set(filter_paths) if filter_paths else None

    # Filter data files if filter_paths provided
    filtered_data_files = {}
    all_files = []

    for split, file_list in data_files.items():
        filtered_files = []
        for file_uri in file_list:
            resolved = fs.resolve_path(file_uri)
            path_in_repo = resolved.path_in_repo

            # Include file if no filter or if in filter set
            if filter_set is None or path_in_repo in filter_set:
                filtered_files.append(file_uri)
                all_files.append(path_in_repo)

        if filtered_files:
            filtered_data_files[split] = filtered_files

    if not all_files:
        raise ValueError("No data files found to determine data directory")

    try:
        # Extract directory from each file path, then find common directory
        # This ensures we get a directory path, not a file path (which would happen
        # with os.path.commonpath when there's only one file)
        directories = [os.path.dirname(f) for f in all_files]
        data_dir = os.path.commonpath(directories)
    except ValueError as e:
        raise ValueError(
            f"Unable to determine common data directory from files: {all_files}"
        ) from e

    return filtered_data_files, data_dir


# =============================================================================
# Dataset Discovery and Bridging
# =============================================================================


@dataclass
class DatasetInfo:
    """Information about a HuggingFace dataset.

    This class discovers and represents the structure of a HuggingFace dataset,
    including its configuration, splits, and Parquet files. It serves as the
    discovery layer that gathers all necessary information before conversion
    to Iceberg format.
    """

    repo_id: str
    config: str
    splits: List[str]
    data_files: Dict[str, List[str]]  # split -> list of fully qualified URIs (with revision)
    data_dir: str
    features: Features  # HuggingFace dataset features
    revision: str  # Git revision/SHA of the dataset

    @classmethod
    def discover(
        cls,
        repo_id: str,
        config: str,
        token: Optional[str] = None,
        since_revision: Optional[str] = None,
    ) -> "DatasetInfo":
        """Discover Parquet files and structure in a HuggingFace dataset.

        Discovery process:
        1. Validate config exists in dataset
        2. Load dataset builder to get metadata
        3. Extract splits from builder
        4. Get data files (fully qualified URIs with revision)
           - If since_revision is provided, only get files added since that revision
           - Otherwise, get all files from builder
        5. Get dataset revision (SHA) from Hub
        6. Extract data directory from config or URIs
        7. Return DatasetInfo with all metadata

        Args:
            repo_id: HuggingFace dataset repository ID (e.g., "kszucs/dataset1")
            config: Configuration name to discover
            token: HuggingFace API token (uses HF_TOKEN env var if not provided)
            since_revision: Optional revision SHA to get only files added since that revision

        Returns:
            DatasetInfo with discovered structure

        Raises:
            ValueError: If dataset not found or config doesn't exist
        """
        try:
            builder = dataset_builder_safe(repo_id, config=config, token=token)
        except Exception as e:
            raise ValueError(
                f"Dataset {repo_id} config {config} not found or not accessible: {e}"
            ) from e

        revision = builder.hash
        features = builder.info.features

        # Get filter paths if since_revision is provided
        filter_paths = None
        if since_revision:
            filter_paths = dataset_new_files(
                repo_id=repo_id,
                config=config,
                old_revision=since_revision,
                new_revision=revision,
                token=token,
            )

        # Filter data files and extract data directory
        data_files, data_dir = dataset_data_files(
            builder.config.data_files, filter_paths=filter_paths
        )

        splits = list(data_files.keys())

        if not data_files:
            raise ValueError("No Parquet files found in dataset configuration")

        return cls(
            repo_id=repo_id,
            config=config,
            splits=splits,
            data_files=data_files,  # Store fully qualified URIs
            data_dir=data_dir,
            features=features,
            revision=revision,
        )

    def to_table_info(
        self,
        namespace: str,
        table_name: str,
    ) -> TableInfo:
        """Convert DatasetInfo to TableInfo.

        This method creates table metadata for the HuggingFace dataset config
        with an explicit table name, supporting the namespace-based configuration.

        Args:
            namespace: Iceberg namespace for the table
            table_name: Explicit table name (no auto-generation)

        Returns:
            TableInfo object
        """
        # Build Iceberg schema with split column
        schema = iceberg_schema_from_features(self.features, include_split_column=True)

        # Build partition spec (partitioned by split)
        partition_spec = iceberg_partition_spec(schema)

        # Collect file information with fully qualified URIs
        files = []
        for split, file_uris in self.data_files.items():
            for uri in file_uris:
                files.append(FileInfo(uri=uri, split=split))

        # Create TableInfo with explicit naming
        return TableInfo(
            namespace=namespace,
            table_name=table_name,  # Direct from config, no auto-generation
            schema=schema,
            partition_spec=partition_spec,
            data_files=files,
            data_dir=self.data_dir,
            dataset_repo=self.repo_id,
            dataset_config=self.config,
            dataset_revision=self.revision,
        )
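The deleted module's central idea, building an Iceberg schema from HuggingFace dataset features by going through Arrow and then assigning fresh field IDs, is easy to see in isolation. A minimal sketch using the same calls as the removed `iceberg_schema_from_features`, assuming `datasets` and a pyiceberg version where the private helper `_pyarrow_to_schema_without_ids` is available:

```python
from datasets import Features, Value
from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids
from pyiceberg.schema import assign_fresh_schema_ids

# Toy feature spec standing in for a real HuggingFace dataset config.
features = Features({"id": Value("string"), "text": Value("string")})

# Features -> Arrow schema -> Iceberg schema without IDs -> globally unique IDs,
# mirroring the conversion chain in the deleted bridge module.
arrow_schema = features.arrow_schema
schema = assign_fresh_schema_ids(_pyarrow_to_schema_without_ids(arrow_schema))
print(schema)
```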