faceberg 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faceberg/bridge.py DELETED
@@ -1,586 +0,0 @@
- """Bridge between HuggingFace datasets and Apache Iceberg tables.
-
- This module discovers HuggingFace datasets and converts them to TableInfo objects
- that contain all the Iceberg metadata needed for table creation.
- """
-
- import json
- import os
- import tempfile
- from dataclasses import dataclass
- from typing import Dict, List, Optional, Tuple
-
- from datasets import (
-     Features,
-     load_dataset_builder,
- )
- from huggingface_hub import HfApi, HfFileSystem
- from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids
- from pyiceberg.partitioning import PartitionField, PartitionSpec
- from pyiceberg.schema import Schema, assign_fresh_schema_ids
- from pyiceberg.transforms import IdentityTransform
- from pyiceberg.types import ListType, MapType, NestedField, StringType, StructType
-
- # =============================================================================
- # Bridge Output Classes
- # =============================================================================
-
-
- @dataclass
- class FileInfo:
-     """Information about a data file in Iceberg table."""
-
-     uri: str  # Full hf:// URI to the file
-     split: Optional[str] = None  # Split name (train, test, validation, etc.)
-     size_bytes: Optional[int] = None  # File size in bytes (enriched later)
-     row_count: Optional[int] = None  # Number of rows in the file (enriched later)
-
-
- @dataclass
- class TableInfo:
-     """Complete information needed to create an Iceberg table.
-
-     This class serves as the output of the bridge layer, containing all the
-     metadata needed to convert a HuggingFace dataset into an Iceberg table.
-     """
-
-     # Table identity
-     namespace: str  # Iceberg namespace (e.g., "default")
-     table_name: str  # Table name (e.g., "squad_plain_text")
-
-     # Iceberg schema and partitioning
-     schema: Schema  # Iceberg schema with field IDs
-     partition_spec: PartitionSpec  # Partition specification
-
-     # Data files
-     data_files: List[FileInfo]  # List of data files with metadata
-     data_dir: str  # Data directory path relative to repo root
-
-     # Source metadata (for traceability)
-     dataset_repo: str  # HuggingFace repo ID
-     dataset_config: str  # Dataset configuration name
-     dataset_revision: str  # Git revision/SHA of the dataset
-
-     @property
-     def identifier(self) -> str:
-         """Get table identifier in 'namespace.table_name' format."""
-         return f"{self.namespace}.{self.table_name}"
-
-     @property
-     def total_rows(self) -> int:
-         """Get total row count across all files."""
-         return sum(f.row_count for f in self.data_files if f.row_count is not None)
-
-     @property
-     def total_size(self) -> int:
-         """Get total size in bytes across all files."""
-         return sum(f.size_bytes for f in self.data_files if f.size_bytes is not None)
-
-     def get_table_properties(self) -> Dict[str, str]:
-         """Get table properties for Iceberg metadata.
-
-         Returns:
-             Dictionary of table properties including source metadata and name mapping
-         """
-         # Create schema name mapping for Parquet files without embedded field IDs
-         name_mapping = iceberg_name_mapping(self.schema)
-
-         # Use data directory from discovery
-         data_path = (
-             f"hf://datasets/{self.dataset_repo}/{self.data_dir}"
-             if self.data_dir
-             else f"hf://datasets/{self.dataset_repo}"
-         )
-
-         # TODO(kszucs): split should be configurable
-         properties = {
-             "format-version": "3",
-             "write.parquet.compression-codec": "snappy",
-             "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
-             "write.data.path": data_path,
-             # HuggingFace source metadata
-             "hf.dataset.repo": self.dataset_repo,
-             "hf.dataset.config": self.dataset_config,
-             "hf.dataset.revision": self.dataset_revision,
-             # Write configuration
-             "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
-             "hf.write.split": "train",
-             # Schema mapping
-             "schema.name-mapping.default": json.dumps(name_mapping),
-         }
-
-         return properties
-
-
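For orientation while reviewing the removed code: a TableInfo bundles everything needed to register the dataset as an Iceberg table. The sketch below is illustrative only and is not part of the package; it assumes a TableInfo already built by this module and a pre-loaded pyiceberg catalog object (the name catalog is hypothetical).

# Illustrative sketch, not part of faceberg: consuming a TableInfo.
# Assumes table_info was produced by the bridge layer and catalog is a
# pyiceberg catalog loaded elsewhere.
props = table_info.get_table_properties()
table = catalog.create_table(
    identifier=table_info.identifier,          # e.g. "default.squad_plain_text"
    schema=table_info.schema,
    partition_spec=table_info.partition_spec,  # identity partition on "split"
    properties=props,                          # hf.* metadata plus the name mapping
)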
- # =============================================================================
- # Iceberg Helpers (Schema and Metadata)
- # =============================================================================
-
-
- def iceberg_field_mapping(field: NestedField) -> Dict[str, any]:
-     """Build name mapping for a single field, recursively handling nested types.
-
-     Args:
-         field: Iceberg NestedField to create mapping for
-
-     Returns:
-         Dictionary containing field-id, names, and optionally nested fields
-     """
-     mapping = {
-         "field-id": field.field_id,
-         "names": [field.name],
-     }
-
-     # Handle nested types
-     if isinstance(field.field_type, StructType):
-         # Recursively map nested struct fields
-         nested_fields = []
-         for nested_field in field.field_type.fields:
-             nested_fields.append(iceberg_field_mapping(nested_field))
-         if nested_fields:
-             mapping["fields"] = nested_fields
-     elif isinstance(field.field_type, ListType):
-         # Create mapping for the list element
-         element_mapping = {
-             "field-id": field.field_type.element_id,
-             "names": ["element"],
-         }
-         # If element is a struct, recursively map its fields
-         if isinstance(field.field_type.element_type, StructType):
-             element_fields = []
-             for nested_field in field.field_type.element_type.fields:
-                 element_fields.append(iceberg_field_mapping(nested_field))
-             if element_fields:
-                 element_mapping["fields"] = element_fields
-         mapping["fields"] = [element_mapping]
-     elif isinstance(field.field_type, MapType):
-         # Create mappings for key and value
-         map_fields = []
-
-         # Map the key
-         key_mapping = {
-             "field-id": field.field_type.key_id,
-             "names": ["key"],
-         }
-         if isinstance(field.field_type.key_type, StructType):
-             key_fields = []
-             for nested_field in field.field_type.key_type.fields:
-                 key_fields.append(iceberg_field_mapping(nested_field))
-             if key_fields:
-                 key_mapping["fields"] = key_fields
-         map_fields.append(key_mapping)
-
-         # Map the value
-         value_mapping = {
-             "field-id": field.field_type.value_id,
-             "names": ["value"],
-         }
-         if isinstance(field.field_type.value_type, StructType):
-             value_fields = []
-             for nested_field in field.field_type.value_type.fields:
-                 value_fields.append(iceberg_field_mapping(nested_field))
-             if value_fields:
-                 value_mapping["fields"] = value_fields
-         map_fields.append(value_mapping)
-
-         mapping["fields"] = map_fields
-
-     return mapping
-
-
- def iceberg_name_mapping(schema: Schema) -> List[Dict[str, any]]:
-     """Build Iceberg name mapping from schema, recursively handling nested fields.
-
-     Name mapping is used to map Parquet column names to Iceberg field IDs for
-     files that don't have embedded field IDs.
-
-     Args:
-         schema: Iceberg schema to create mapping for
-
-     Returns:
-         List of field mappings with field-id, names, and nested fields
-     """
-     fields = []
-     for field in schema.fields:
-         fields.append(iceberg_field_mapping(field))
-     return fields
-
-
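To make the mapping format concrete, here is a small illustrative sketch (not part of the module) of what iceberg_name_mapping returns for a flat two-column schema; this is the structure that get_table_properties serializes into the schema.name-mapping.default property.

# Illustrative sketch: name mapping for a flat schema with two primitive fields.
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="title", field_type=StringType(), required=False),
)
iceberg_name_mapping(schema)
# -> [{'field-id': 1, 'names': ['id']}, {'field-id': 2, 'names': ['title']}]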
- def iceberg_partition_spec(schema: Schema) -> PartitionSpec:
-     """Build a partition spec that uses 'split' as a partition key.
-
-     This creates an identity partition on the split column, which means the split
-     value will be stored in metadata and used for partition pruning.
-
-     Args:
-         schema: Iceberg schema containing a 'split' field
-
-     Returns:
-         PartitionSpec with split as partition key
-
-     Raises:
-         ValueError: If schema doesn't contain a 'split' field
-     """
-     split_field = schema.find_field("split")
-     if split_field is None:
-         raise ValueError("Schema must contain a 'split' field to create split partition spec")
-
-     return PartitionSpec(
-         PartitionField(
-             source_id=split_field.field_id,
-             field_id=1000,  # Partition field IDs start at 1000
-             transform=IdentityTransform(),
-             name="split",
-         ),
-         spec_id=0,
-     )
-
-
- def iceberg_schema_from_features(features, include_split_column: bool = True) -> Schema:
-     """
-     Build an Iceberg Schema from HuggingFace dataset features using Arrow as an intermediate format.
-
-     This approach ensures globally unique field IDs across nested structures by leveraging
-     PyIceberg's built-in conversion and ID assignment logic.
-
-     Args:
-         features: HuggingFace Features object or dict of features
-         include_split_column: If True, adds a 'split' column to the schema (default: True)
-
-     Returns:
-         PyIceberg Schema object with globally unique field IDs
-     """
-     # Convert to Features if dict
-     if isinstance(features, dict):
-         features = Features(features)
-
-     # Convert: Features → Arrow Schema → Iceberg Schema (without IDs) → Assign fresh IDs
-     # This ensures globally unique field IDs across all nested structures
-     arrow_schema = features.arrow_schema
-     iceberg_schema_no_ids = _pyarrow_to_schema_without_ids(arrow_schema)
-     schema = assign_fresh_schema_ids(iceberg_schema_no_ids)
-
-     # Add split column as the first field if requested
-     if include_split_column:
-         # Create split field (will get ID 1 after reassignment)
-         # Note: Although the schema uses StringType, the actual Parquet data
-         # will use dictionary encoding (int8 indices) for compression efficiency
-         # The split column is optional since it doesn't exist in the source Parquet files,
-         # it's derived from partition metadata
-         split_field = NestedField(
-             field_id=-1,  # Temporary ID, will be reassigned
-             name="split",
-             field_type=StringType(),
-             required=False,
-         )
-         # Prepend split field to existing fields
-         new_fields = [split_field] + list(schema.fields)
-
-         # Create new schema and reassign all field IDs globally
-         # This ensures field IDs are globally unique across nested structures
-         schema_with_split = Schema(*new_fields)
-         schema = assign_fresh_schema_ids(schema_with_split)
-
-     return schema
-
-
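A short usage sketch for the conversion helper above (illustrative only, with toy features): the resulting Iceberg schema carries freshly assigned field IDs and, when requested, a leading optional split column.

# Illustrative sketch: HuggingFace Features -> Iceberg Schema with a split column.
from datasets import Features, Value

features = Features({"id": Value("string"), "title": Value("string")})
schema = iceberg_schema_from_features(features, include_split_column=True)
print([f.name for f in schema.fields])      # ['split', 'id', 'title']
print(schema.find_field("split").required)  # False: split is derived from partition metadata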
- # =============================================================================
- # Dataset Helpers (HuggingFace)
- # =============================================================================
-
-
- def dataset_new_files(
-     repo_id: str,
-     config: str,
-     old_revision: str,
-     new_revision: str,
-     token: Optional[str] = None,
- ) -> List[str]:
-     """Find new parquet files added between two revisions.
-
-     Uses HuggingFace Hub API to diff two git revisions and identify
-     new parquet files for a specific dataset configuration.
-
-     Args:
-         repo_id: HuggingFace dataset repo ID (e.g., "squad")
-         config: Dataset configuration name (e.g., "plain_text")
-         old_revision: Previous commit SHA
-         new_revision: Current commit SHA or branch (usually "main")
-         token: HuggingFace API token
-
-     Returns:
-         List of path_in_repo strings for new parquet files
-
-     Example:
-         >>> dataset_new_files(
-         ...     "squad",
-         ...     "plain_text",
-         ...     "abc123",
-         ...     "def456"
-         ... )
-         ['plain_text/train-00000-of-00001.parquet', 'plain_text/validation-00000-of-00001.parquet']
-     """
-     api = HfApi(token=token)
-
-     # Get all files at old revision
-     old_files = set(
-         api.list_repo_files(
-             repo_id=repo_id,
-             repo_type="dataset",
-             revision=old_revision,
-         )
-     )
-
-     # Get all files at new revision
-     new_files = set(
-         api.list_repo_files(
-             repo_id=repo_id,
-             repo_type="dataset",
-             revision=new_revision,
-         )
-     )
-
-     # Find added files (set difference)
-     added_files = new_files - old_files
-
-     # Filter for parquet files in this config
-     config_prefix = f"{config}/"
-     relative_paths = sorted(
-         f for f in added_files if f.endswith(".parquet") and f.startswith(config_prefix)
-     )
-
-     return relative_paths
-
-
- def dataset_builder_safe(
-     repo_id: str,
-     config: str,
-     token: Optional[str] = None,
- ):
-     """Load dataset builder while avoiding picking up local files.
-
-     Changes to a temporary directory before loading to ensure the datasets
-     library doesn't pick up local files in the current directory.
-
-     Args:
-         repo_id: HuggingFace dataset repository ID
-         config: Dataset configuration name
-         token: Optional HuggingFace API token
-
-     Returns:
-         Dataset builder object
-
-     Raises:
-         Exception: If loading fails
-     """
-     original_cwd = os.getcwd()
-
-     try:
-         # Change to a temporary directory to avoid dataset library picking up local files
-         with tempfile.TemporaryDirectory() as tmpdir:
-             os.chdir(tmpdir)
-             return load_dataset_builder(repo_id, config, token=token)
-     finally:
-         # Always restore the original directory
-         os.chdir(original_cwd)
-
-
- def dataset_data_files(
-     data_files: Dict[str, List[str]], filter_paths: Optional[List[str]] = None
- ) -> Tuple[Dict[str, List[str]], str]:
-     """Filter data files and extract data directory.
-
-     Optionally filters data files by path_in_repo and extracts the common directory path.
-
-     Args:
-         data_files: Dictionary mapping splits to lists of file URIs (with revision)
-         filter_paths: Optional list of path_in_repo strings to filter by
-
-     Returns:
-         Tuple of (filtered_data_files, data_dir)
-
-     Example:
-         >>> dataset_data_files(
-         ...     {"train": ["hf://datasets/repo@rev/plain_text/train-00000.parquet"]},
-         ...     filter_paths=["plain_text/train-00000.parquet"]
-         ... )
-         ({'train': ['hf://datasets/repo@rev/plain_text/train-00000.parquet']}, 'plain_text')
-     """
-     fs = HfFileSystem()
-
-     # Convert filter_paths to set for fast lookup
-     filter_set = set(filter_paths) if filter_paths else None
-
-     # Filter data files if filter_paths provided
-     filtered_data_files = {}
-     all_files = []
-
-     for split, file_list in data_files.items():
-         filtered_files = []
-         for file_uri in file_list:
-             resolved = fs.resolve_path(file_uri)
-             path_in_repo = resolved.path_in_repo
-
-             # Include file if no filter or if in filter set
-             if filter_set is None or path_in_repo in filter_set:
-                 filtered_files.append(file_uri)
-                 all_files.append(path_in_repo)
-
-         if filtered_files:
-             filtered_data_files[split] = filtered_files
-
-     if not all_files:
-         raise ValueError("No data files found to determine data directory")
-
-     try:
-         # Extract directory from each file path, then find common directory
-         # This ensures we get a directory path, not a file path (which would happen
-         # with os.path.commonpath when there's only one file)
-         directories = [os.path.dirname(f) for f in all_files]
-         data_dir = os.path.commonpath(directories)
-     except ValueError as e:
-         raise ValueError(
-             f"Unable to determine common data directory from files: {all_files}"
-         ) from e
-
-     return filtered_data_files, data_dir
-
-
- # =============================================================================
- # Dataset Discovery and Bridging
- # =============================================================================
-
-
- @dataclass
- class DatasetInfo:
-     """Information about a HuggingFace dataset.
-
-     This class discovers and represents the structure of a HuggingFace dataset,
-     including its configuration, splits, and Parquet files. It serves as the
-     discovery layer that gathers all necessary information before conversion
-     to Iceberg format.
-     """
-
-     repo_id: str
-     config: str
-     splits: List[str]
-     data_files: Dict[str, List[str]]  # split -> list of fully qualified URIs (with revision)
-     data_dir: str
-     features: Features  # HuggingFace dataset features
-     revision: str  # Git revision/SHA of the dataset
-
-     @classmethod
-     def discover(
-         cls,
-         repo_id: str,
-         config: str,
-         token: Optional[str] = None,
-         since_revision: Optional[str] = None,
-     ) -> "DatasetInfo":
-         """Discover Parquet files and structure in a HuggingFace dataset.
-
-         Discovery process:
-         1. Validate config exists in dataset
-         2. Load dataset builder to get metadata
-         3. Extract splits from builder
-         4. Get data files (fully qualified URIs with revision)
-            - If since_revision is provided, only get files added since that revision
-            - Otherwise, get all files from builder
-         5. Get dataset revision (SHA) from Hub
-         6. Extract data directory from config or URIs
-         7. Return DatasetInfo with all metadata
-
-         Args:
-             repo_id: HuggingFace dataset repository ID (e.g., "kszucs/dataset1")
-             config: Configuration name to discover
-             token: HuggingFace API token (uses HF_TOKEN env var if not provided)
-             since_revision: Optional revision SHA to get only files added since that revision
-
-         Returns:
-             DatasetInfo with discovered structure
-
-         Raises:
-             ValueError: If dataset not found or config doesn't exist
-         """
-         try:
-             builder = dataset_builder_safe(repo_id, config=config, token=token)
-         except Exception as e:
-             raise ValueError(
-                 f"Dataset {repo_id} config {config} not found or not accessible: {e}"
-             ) from e
-
-         revision = builder.hash
-         features = builder.info.features
-
-         # Get filter paths if since_revision is provided
-         filter_paths = None
-         if since_revision:
-             filter_paths = dataset_new_files(
-                 repo_id=repo_id,
-                 config=config,
-                 old_revision=since_revision,
-                 new_revision=revision,
-                 token=token,
-             )
-
-         # Filter data files and extract data directory
-         data_files, data_dir = dataset_data_files(
-             builder.config.data_files, filter_paths=filter_paths
-         )
-
-         splits = list(data_files.keys())
-
-         if not data_files:
-             raise ValueError("No Parquet files found in dataset configuration")
-
-         return cls(
-             repo_id=repo_id,
-             config=config,
-             splits=splits,
-             data_files=data_files,  # Store fully qualified URIs
-             data_dir=data_dir,
-             features=features,
-             revision=revision,
-         )
-
-     def to_table_info(
-         self,
-         namespace: str,
-         table_name: str,
-     ) -> TableInfo:
-         """Convert DatasetInfo to TableInfo.
-
-         This method creates table metadata for the HuggingFace dataset config
-         with an explicit table name, supporting the namespace-based configuration.
-
-         Args:
-             namespace: Iceberg namespace for the table
-             table_name: Explicit table name (no auto-generation)
-
-         Returns:
-             TableInfo object
-         """
-         # Build Iceberg schema with split column
-         schema = iceberg_schema_from_features(self.features, include_split_column=True)
-
-         # Build partition spec (partitioned by split)
-         partition_spec = iceberg_partition_spec(schema)
-
-         # Collect file information with fully qualified URIs
-         files = []
-         for split, file_uris in self.data_files.items():
-             for uri in file_uris:
-                 files.append(FileInfo(uri=uri, split=split))
-
-         # Create TableInfo with explicit naming
-         return TableInfo(
-             namespace=namespace,
-             table_name=table_name,  # Direct from config, no auto-generation
-             schema=schema,
-             partition_spec=partition_spec,
-             data_files=files,
-             data_dir=self.data_dir,
-             dataset_repo=self.repo_id,
-             dataset_config=self.config,
-             dataset_revision=self.revision,
-         )
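Taken together, the removed module implemented a two-step bridge: discover a dataset configuration on the Hub, then convert it into the Iceberg metadata needed to register a table. A minimal end-to-end sketch (illustrative only; the repo ID is a placeholder and error handling is omitted):

# Illustrative sketch of the bridge flow implemented by the removed module.
# "user/some-dataset" is a placeholder repo ID, not a real dataset.
info = DatasetInfo.discover("user/some-dataset", config="default")
table = info.to_table_info(namespace="default", table_name="some_dataset")

print(table.identifier)                      # "default.some_dataset"
print([f.uri for f in table.data_files])     # hf:// URIs, one per discovered Parquet file
print(table.get_table_properties()["hf.dataset.revision"])  # pinned source revision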