faceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faceberg/convert.py DELETED
@@ -1,813 +0,0 @@
- """Conversion from TableInfo to Iceberg metadata files.
-
- This module takes TableInfo objects (created by the bridge layer) and converts them
- into actual Iceberg metadata files in metadata-only mode, referencing the original
- HuggingFace dataset files.
- """
-
- import logging
- import time
- import uuid
- from pathlib import Path
- from typing import Callable, Dict, List, Optional
-
- import pyarrow.parquet as pq
- from huggingface_hub import get_hf_file_metadata, hf_hub_url
- from pyiceberg.io.pyarrow import PyArrowFileIO
- from pyiceberg.manifest import (
-     DataFile,
-     DataFileContent,
-     FileFormat,
-     ManifestEntry,
-     ManifestEntryStatus,
-     ManifestFile,
-     write_manifest,
-     write_manifest_list,
- )
- from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
- from pyiceberg.schema import Schema
- from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataV2, new_table_metadata
- from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
- from pyiceberg.table.snapshots import Operation, Snapshot, Summary
- from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
- from pyiceberg.transforms import IdentityTransform
-
- # Import FileInfo (created by bridge layer)
- from .bridge import FileInfo
-
- logger = logging.getLogger(__name__)
-
-
- # TODO(kszucs): parallelize metadata creation for large number of files
-
-
- class IcebergMetadataWriter:
-     """Writes Iceberg metadata files in metadata-only mode.
-
-     This writer creates Iceberg metadata (manifest, manifest list, table metadata)
-     that references existing HuggingFace dataset files without copying or modifying them.
-     """
-
-     def __init__(
-         self,
-         table_path: Path,
-         schema: Schema,
-         partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
-         base_uri: str = None,
-     ):
-         """Initialize metadata writer.
-
-         Args:
-             table_path: Local directory for physically writing files (staging directory)
-             schema: Iceberg schema
-             partition_spec: Partition specification
-             base_uri: Base URI for paths stored in metadata
-                 (e.g., "file:///path/to/catalog" or "hf://datasets/org/repo")
-         """
-         self.table_path = table_path
-         self.schema = schema
-         self.partition_spec = partition_spec
-         self.metadata_dir = table_path / "metadata"
-         self.metadata_dir.mkdir(parents=True, exist_ok=True)
-         self.file_io = PyArrowFileIO()
-
-         # Store base URI for metadata references
-         self.base_uri = base_uri.rstrip("/")
-
-     def create_metadata_from_files(
-         self,
-         file_infos: List[FileInfo],
-         table_uuid: str,
-         properties: Optional[Dict[str, str]] = None,
-         progress_callback: Optional[Callable] = None,
-         identifier: Optional[str] = None,
-     ) -> Path:
-         """Create Iceberg metadata from data file information.
-
-         This method creates all necessary Iceberg metadata files:
-         - Manifest file (.avro)
-         - Manifest list (.avro)
-         - Table metadata (v1.metadata.json)
-         - Version hint (version-hint.text)
-
-         Args:
-             file_infos: List of FileInfo objects describing data files
-             table_uuid: UUID for the table
-             properties: Optional table properties
-             progress_callback: Optional callback for progress updates
-             identifier: Optional table identifier for progress reporting
-
-         Returns:
-             Path to the metadata file
-         """
-         logger.info(f"Creating Iceberg metadata for {len(file_infos)} files")
-
-         # Step 1: Read file metadata from HuggingFace Hub
-         enriched_files = self._read_file_metadata(
-             file_infos, progress_callback=progress_callback, identifier=identifier
-         )
-
-         # Step 2: Create DataFile entries
-         data_files = self._create_data_files(enriched_files)
-
-         # Step 3: Write metadata files
-         return self._write_metadata_files(data_files, table_uuid, properties or {})
-
-     def _get_hf_file_size(self, file_path: str) -> int:
-         """Get the actual file size from HuggingFace Hub.
-
-         This queries the HuggingFace API to get the exact file size. While we could
-         calculate an approximate size from Parquet metadata, the calculation is not
-         exact enough for DuckDB's iceberg_scan which needs precise file sizes.
-
-         Args:
-             file_path: HuggingFace file path in format hf://datasets/repo_id/path/to/file
-
-         Returns:
-             File size in bytes
-
-         Raises:
-             ValueError: If file path cannot be parsed or file size cannot be determined
-         """
-         # Parse hf:// URL - format is hf://datasets/org/repo@revision/path/to/file
-         # or hf://datasets/org/repo/path/to/file (without revision)
-         if not file_path.startswith("hf://datasets/"):
-             raise ValueError(f"Invalid HuggingFace file path: {file_path}")
-
-         # Split into repo_id@revision (org/repo@revision) and filename (path/to/file)
-         remaining = file_path[len("hf://datasets/") :]
-         parts = remaining.split("/")
-         if len(parts) < 3:
-             raise ValueError(f"Invalid HuggingFace file path format: {file_path}")
-
-         # Handle repo_id with optional @revision
-         repo_part = f"{parts[0]}/{parts[1]}"  # org/repo@revision or org/repo
-         if "@" in repo_part:
-             repo_id, revision = repo_part.split("@", 1)
-         else:
-             repo_id = repo_part
-             revision = None
-
-         filename = "/".join(parts[2:])  # path/to/file
-         url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", revision=revision)
-         metadata = get_hf_file_metadata(url)
-         return metadata.size
-
-     def _read_file_metadata(
-         self,
-         file_infos: List[FileInfo],
-         progress_callback: Optional[Callable] = None,
-         identifier: Optional[str] = None,
-     ) -> List[FileInfo]:
-         """Read metadata from HuggingFace Hub files without downloading.
-
-         Args:
-             file_infos: List of FileInfo objects (may have size/row_count = 0)
-             progress_callback: Optional callback for progress updates
-             identifier: Optional table identifier for progress reporting
-
-         Returns:
-             List of FileInfo objects with enriched metadata
-
-         Raises:
-             Exception: If metadata cannot be read from any file
-         """
-         enriched = []
-         total_files = len(file_infos)
-
-         for i, file_info in enumerate(file_infos):
-             # Read metadata directly from HF Hub without downloading the file
-             metadata = pq.read_metadata(file_info.uri)
-             row_count = metadata.num_rows
-
-             # Use provided size if available, otherwise get from HuggingFace API
-             file_size = file_info.size_bytes
-             if not file_size:
-                 # Get exact file size from HuggingFace Hub API
-                 file_size = self._get_hf_file_size(file_info.uri)
-
-             enriched.append(
-                 FileInfo(
-                     uri=file_info.uri,
-                     split=file_info.split,
-                     size_bytes=file_size,
-                     row_count=row_count,
-                 )
-             )
-
-             # Report progress after processing each file
-             if progress_callback and identifier:
-                 percent = 10 + int((i + 1) / total_files * 80)
-                 progress_callback(identifier, state="in_progress", percent=percent)
-
-         return enriched
-
-     def _create_data_files(
-         self,
-         file_infos: List[FileInfo],
-         sequence_number: int = INITIAL_SEQUENCE_NUMBER,
-         previous_data_files: Optional[List[DataFile]] = None,
-     ) -> List[DataFile]:
-         """Create Iceberg DataFile entries from file information.
-
-         Args:
-             file_infos: List of FileInfo objects with metadata
-             sequence_number: Current sequence number (default: 0 for initial snapshot)
-             previous_data_files: Optional list of data files from previous snapshot for
-                 inheritance tracking
-
-         Returns:
-             List of Iceberg DataFile objects
-         """
-         # Build lookup of previous files by path for inheritance checking
-         previous_files_map = {}
-         if previous_data_files:
-             for prev_file in previous_data_files:
-                 previous_files_map[prev_file.file_path] = prev_file
-
-         data_files = []
-
-         for file_info in file_infos:
-             # Build partition values based on the partition spec
-             # Partition dict maps from partition field position to the partition value
-             if self.partition_spec != UNPARTITIONED_PARTITION_SPEC and file_info.split:
-                 # Use position 0 for the first (and only) partition field
-                 # Convert split to string (it may be a NamedSplit object from HuggingFace)
-                 partition = {0: str(file_info.split)}
-             else:
-                 partition = {}
-
-             # Determine file_sequence_number: inherit from previous snapshot if file unchanged
-             prev_file = previous_files_map.get(file_info.uri)
-             if prev_file and self._files_identical(prev_file, file_info):
-                 # File unchanged - inherit sequence number
-                 file_seq_num = prev_file.file_sequence_number
-             else:
-                 # File is new or modified - use current sequence number
-                 file_seq_num = sequence_number
-
-             data_file = DataFile.from_args(
-                 content=DataFileContent.DATA,
-                 file_path=file_info.uri,
-                 file_format=FileFormat.PARQUET,
-                 partition=partition,
-                 record_count=file_info.row_count,
-                 file_size_in_bytes=file_info.size_bytes,
-                 file_sequence_number=file_seq_num,  # Track inheritance
-                 column_sizes={},
-                 value_counts={},
-                 null_value_counts={},
-                 nan_value_counts={},
-                 lower_bounds={},
-                 upper_bounds={},
-                 key_metadata=None,
-                 split_offsets=None,
-                 equality_ids=None,
-                 sort_order_id=None,
-             )
-             data_files.append(data_file)
-
-         return data_files
-
-     def _files_identical(self, prev_file: DataFile, current_file: FileInfo) -> bool:
-         """Check if file is unchanged between snapshots.
-
-         Args:
-             prev_file: DataFile from previous snapshot
-             current_file: FileInfo for current file
-
-         Returns:
-             True if file is unchanged
-         """
-         return (
-             prev_file.file_size_in_bytes == current_file.size_bytes
-             and prev_file.record_count == current_file.row_count
-         )
-
-     def _get_previous_manifests(self, metadata: TableMetadataV2) -> Optional[List[ManifestFile]]:
-         """Extract manifest file references from the current snapshot without reading
-         their contents.
-
-         This method is used for fast append operations to reuse existing manifest files
-         without downloading and reading their contents. This significantly reduces
-         bandwidth and improves performance for remote catalogs.
-
-         Args:
-             metadata: Current table metadata
-
-         Returns:
-             List of ManifestFile objects from current snapshot, or None if no snapshots
-         """
-         # Return None if there's no current snapshot
-         if not metadata.current_snapshot_id or not metadata.snapshots:
-             return None
-
-         # Find the current snapshot
-         current_snapshot = next(
-             (s for s in metadata.snapshots if s.snapshot_id == metadata.current_snapshot_id),
-             None,
-         )
-         if not current_snapshot:
-             return None
-
-         # Return manifest file references (without reading their contents)
-         return list(current_snapshot.manifests(self.file_io))
-
-     def _write_metadata_files(
-         self,
-         data_files: List[DataFile],
-         table_uuid: str,
-         properties: Dict[str, str],
-     ) -> Path:
-         """Write Iceberg table metadata, manifest, and manifest list.
-
-         Args:
-             data_files: List of data files
-             table_uuid: UUID for the table
-             properties: Table properties
-
-         Returns:
-             Path to the metadata file
-         """
-         # Step 1: Create and write manifest
-         manifest_file = self._create_manifest(data_files)
-
-         # Step 2: Create snapshot (with properties for summary fields)
-         snapshot = self._create_snapshot(data_files, properties)
-
-         # Step 3: Write manifest list
-         self._write_manifest_list(snapshot, [manifest_file])
-
-         # Step 4: Create table metadata
-         metadata = self._create_table_metadata(snapshot, table_uuid, properties)
-
-         # Step 5: Write metadata file
-         return self._write_metadata_file(metadata)
-
-     def _create_manifest(self, data_files: List[DataFile]):
-         """Create and write manifest file.
-
-         Args:
-             data_files: List of data files
-
-         Returns:
-             ManifestFile object
-         """
-         manifest_filename = f"{uuid.uuid4()}.avro"
-         manifest_write_path = self.metadata_dir / manifest_filename
-         manifest_uri = f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}"
-         output_file = self.file_io.new_output(str(manifest_write_path))
-
-         with write_manifest(
-             format_version=2,
-             spec=self.partition_spec,
-             schema=self.schema,
-             output_file=output_file,
-             snapshot_id=1,
-             avro_compression="deflate",
-         ) as writer:
-             for data_file in data_files:
-                 entry = ManifestEntry.from_args(
-                     status=ManifestEntryStatus.ADDED,
-                     snapshot_id=1,
-                     sequence_number=INITIAL_SEQUENCE_NUMBER,
-                     file_sequence_number=INITIAL_SEQUENCE_NUMBER,
-                     data_file=data_file,
-                 )
-                 writer.add_entry(entry)
-
-         original_manifest = writer.to_manifest_file()
-
-         # Create a new ManifestFile with URI path for metadata references
-         return ManifestFile.from_args(
-             manifest_path=manifest_uri,
-             manifest_length=original_manifest.manifest_length,
-             partition_spec_id=original_manifest.partition_spec_id,
-             content=original_manifest.content,
-             sequence_number=original_manifest.sequence_number,
-             min_sequence_number=original_manifest.min_sequence_number,
-             added_snapshot_id=original_manifest.added_snapshot_id,
-             added_files_count=original_manifest.added_files_count,
-             existing_files_count=original_manifest.existing_files_count,
-             deleted_files_count=original_manifest.deleted_files_count,
-             added_rows_count=original_manifest.added_rows_count,
-             existing_rows_count=original_manifest.existing_rows_count,
-             deleted_rows_count=original_manifest.deleted_rows_count,
-             partitions=original_manifest.partitions,
-             key_metadata=original_manifest.key_metadata,
-         )
-
-     def _create_snapshot(
-         self, data_files: List[DataFile], properties: Optional[Dict[str, str]] = None
-     ) -> Snapshot:
-         """Create snapshot object.
-
-         Args:
-             data_files: List of data files
-             properties: Optional table properties (used to populate snapshot summary)
-
-         Returns:
-             Snapshot object
-         """
-         properties = properties or {}
-         total_records = sum(df.record_count for df in data_files)
-         manifest_filename = f"snap-1-1-{uuid.uuid4()}.avro"
-
-         # Build summary with standard fields + huggingface metadata
-         summary_fields = {
-             "added-data-files": str(len(data_files)),
-             "added-records": str(total_records),
-             "total-data-files": str(len(data_files)),
-             "total-records": str(total_records),
-         }
-
-         # Add hf.* fields from properties to snapshot summary
-         if "hf.dataset.repo" in properties:
-             summary_fields["hf.dataset.repo"] = properties["hf.dataset.repo"]
-         if "hf.dataset.config" in properties:
-             summary_fields["hf.dataset.config"] = properties["hf.dataset.config"]
-         if "hf.dataset.revision" in properties:
-             revision = properties["hf.dataset.revision"]
-             summary_fields["hf.dataset.revision"] = revision
-             # Add short revision (first 7 chars)
-             summary_fields["hf.dataset.revision.short"] = revision[:7]
-
-         return Snapshot(  # type: ignore[call-arg]
-             snapshot_id=1,
-             parent_snapshot_id=None,
-             sequence_number=INITIAL_SEQUENCE_NUMBER,
-             timestamp_ms=1,
-             manifest_list=f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}",
-             summary=Summary(
-                 operation=Operation.APPEND,
-                 **summary_fields,
-             ),
-             schema_id=self.schema.schema_id,
-         )
-
-     def _write_manifest_list(self, snapshot: Snapshot, manifest_files: List):
-         """Write manifest list file.
-
-         Args:
-             snapshot: Snapshot object
-             manifest_files: List of manifest files
-         """
-         # Get filename from the snapshot's manifest_list path and write to staging directory
-         manifest_list_filename = Path(snapshot.manifest_list).name
-         manifest_list_write_path = self.metadata_dir / manifest_list_filename
-         manifest_list_output = self.file_io.new_output(str(manifest_list_write_path))
-
-         with write_manifest_list(
-             format_version=2,
-             output_file=manifest_list_output,
-             snapshot_id=snapshot.snapshot_id,
-             parent_snapshot_id=snapshot.parent_snapshot_id,
-             sequence_number=snapshot.sequence_number,
-             avro_compression="deflate",
-         ) as manifest_list_writer:
-             manifest_list_writer.add_manifests(manifest_files)
-
-     def _create_table_metadata(
-         self,
-         snapshot: Snapshot,
-         table_uuid: str,
-         properties: Dict[str, str],
-     ) -> TableMetadataV2:
-         """Create table metadata object.
-
-         Args:
-             snapshot: Snapshot object
-             table_uuid: UUID for the table
-             properties: Table properties
-
-         Returns:
-             TableMetadataV2 object
-         """
-         # Create initial metadata
-         metadata = new_table_metadata(
-             schema=self.schema,
-             partition_spec=self.partition_spec,
-             sort_order=UNSORTED_SORT_ORDER,
-             location=self.base_uri,
-             properties=properties,
-             table_uuid=uuid.UUID(table_uuid),
-         )
-
-         # Update partition spec with correct field IDs if partitioned
-         if self.partition_spec != UNPARTITIONED_PARTITION_SPEC:
-             # Get the reassigned schema from metadata
-             reassigned_schema = metadata.schema()
-             split_field = reassigned_schema.find_field("split")
-             if split_field:
-                 # Create partition spec with correct source_id
-                 partition_spec_with_correct_ids = PartitionSpec(
-                     PartitionField(
-                         source_id=split_field.field_id,
-                         field_id=1000,
-                         transform=IdentityTransform(),
-                         name="split",
-                     ),
-                     spec_id=0,
-                 )
-                 # Update metadata with correct partition spec
-                 metadata = TableMetadataV2(  # type: ignore[call-arg]
-                     location=metadata.location,
-                     table_uuid=metadata.table_uuid,
-                     last_updated_ms=metadata.last_updated_ms,
-                     last_column_id=metadata.last_column_id,
-                     schemas=metadata.schemas,
-                     current_schema_id=metadata.current_schema_id,
-                     partition_specs=[partition_spec_with_correct_ids],
-                     default_spec_id=0,
-                     last_partition_id=1000,
-                     properties=metadata.properties,
-                     current_snapshot_id=None,
-                     snapshots=[],
-                     snapshot_log=[],
-                     metadata_log=[],
-                     sort_orders=metadata.sort_orders,
-                     default_sort_order_id=metadata.default_sort_order_id,
-                     refs={},
-                     format_version=2,
-                     last_sequence_number=INITIAL_SEQUENCE_NUMBER,
-                 )
-
-         # Update metadata with snapshot
-         return TableMetadataV2(  # type: ignore[call-arg]
-             location=metadata.location,
-             table_uuid=metadata.table_uuid,
-             last_updated_ms=metadata.last_updated_ms,
-             last_column_id=metadata.last_column_id,
-             schemas=metadata.schemas,
-             current_schema_id=metadata.current_schema_id,
-             partition_specs=metadata.partition_specs,
-             default_spec_id=metadata.default_spec_id,
-             last_partition_id=metadata.last_partition_id,
-             properties=metadata.properties,
-             current_snapshot_id=snapshot.snapshot_id,
-             snapshots=[snapshot],
-             snapshot_log=[],
-             metadata_log=[],
-             sort_orders=metadata.sort_orders,
-             default_sort_order_id=metadata.default_sort_order_id,
-             refs={
-                 "main": SnapshotRef(  # type: ignore[call-arg]
-                     snapshot_id=snapshot.snapshot_id,
-                     type=SnapshotRefType.BRANCH,
-                 )
-             },
-             format_version=2,
-             last_sequence_number=INITIAL_SEQUENCE_NUMBER,
-         )
-
-     def _write_metadata_file(self, metadata: TableMetadataV2) -> Path:
-         """Write metadata file and version hint.
-
-         Args:
-             metadata: Table metadata object
-
-         Returns:
-             Path to the metadata file
-         """
-         # Write metadata file - DuckDB expects v1.metadata.json format
-         metadata_file = self.metadata_dir / "v1.metadata.json"
-         with open(metadata_file, "w") as f:
-             f.write(metadata.model_dump_json(indent=2))
-
-         # Write version hint - contains the version number (1)
-         version_hint_file = self.metadata_dir / "version-hint.text"
-         with open(version_hint_file, "w") as f:
-             f.write("1")
-
-         logger.info(f"Wrote metadata to {metadata_file}")
-         return metadata_file
-
-     def append_snapshot_from_files(
-         self,
-         file_infos: List[FileInfo],
-         current_metadata: TableMetadataV2,
-         properties: Optional[Dict[str, str]] = None,
-     ) -> Path:
-         """Append a new snapshot to existing table metadata.
-
-         This method creates a new snapshot with updated files and writes
-         the new metadata version.
-
-         Args:
-             file_infos: List of FileInfo objects describing new data files
-             current_metadata: Current TableMetadataV2 object
-             properties: Optional updated table properties
-
-         Returns:
-             Path to the new metadata file
-         """
-         logger.info(f"Appending snapshot with {len(file_infos)} files")
-
-         # Enrich file metadata
-         enriched_files = self._read_file_metadata(file_infos)
-
-         # Skip inheritance tracking for pure appends (new files only)
-         # The bridge layer already filtered to new files via revision diff,
-         # so no need to compare with previous data files.
-         # This saves 5-25 MB of manifest downloads for remote catalogs.
-         previous_data_files = None
-
-         # Calculate next IDs
-         next_snapshot_id = max(snap.snapshot_id for snap in current_metadata.snapshots) + 1
-         next_sequence_number = current_metadata.last_sequence_number + 1
-
-         # Create DataFile entries (all get new sequence number)
-         data_files = self._create_data_files(
-             enriched_files, next_sequence_number, previous_data_files
-         )
-
-         # Merge properties first (needed for snapshot summary)
-         merged_properties = {**current_metadata.properties}
-         if properties:
-             merged_properties.update(properties)
-
-         # Write manifest for new files only
-         new_manifest_file = self._write_manifest_with_ids(
-             data_files, next_snapshot_id, next_sequence_number
-         )
-
-         # Create snapshot (with properties for summary fields)
-         snapshot = self._create_snapshot_with_ids(
-             data_files, next_snapshot_id, next_sequence_number, merged_properties
-         )
-
-         # Fast append: reuse previous manifests + add new manifest
-         previous_manifests = self._get_previous_manifests(current_metadata)
-         if previous_manifests:
-             all_manifest_files = previous_manifests + [new_manifest_file]
-         else:
-             all_manifest_files = [new_manifest_file]
-
-         # Write manifest list with all manifests
-         self._write_manifest_list(snapshot, all_manifest_files)
-
-         # Create updated metadata
-         updated_metadata = TableMetadataV2(  # type: ignore[call-arg]
-             location=current_metadata.location,
-             table_uuid=current_metadata.table_uuid,
-             last_updated_ms=int(time.time() * 1000),
-             last_column_id=current_metadata.last_column_id,
-             schemas=current_metadata.schemas,
-             current_schema_id=current_metadata.current_schema_id,
-             partition_specs=current_metadata.partition_specs,
-             default_spec_id=current_metadata.default_spec_id,
-             last_partition_id=current_metadata.last_partition_id,
-             properties=merged_properties,
-             current_snapshot_id=snapshot.snapshot_id,
-             snapshots=list(current_metadata.snapshots) + [snapshot],
-             snapshot_log=current_metadata.snapshot_log,
-             metadata_log=current_metadata.metadata_log,
-             sort_orders=current_metadata.sort_orders,
-             default_sort_order_id=current_metadata.default_sort_order_id,
-             refs={
-                 "main": SnapshotRef(  # type: ignore[call-arg]
-                     snapshot_id=snapshot.snapshot_id,
-                     type=SnapshotRefType.BRANCH,
-                 )
-             },
-             format_version=2,
-             last_sequence_number=next_sequence_number,
-         )
-
-         # Write new metadata file
-         return self._write_metadata_version(updated_metadata, next_sequence_number)
-
-     def _write_manifest_with_ids(
-         self, data_files: List[DataFile], snapshot_id: int, sequence_number: int
-     ):
-         """Write manifest file with specific IDs.
-
-         Args:
-             data_files: List of DataFile objects
-             snapshot_id: Snapshot ID
-             sequence_number: Sequence number
-
-         Returns:
-             ManifestFile object
-         """
-         manifest_filename = f"{uuid.uuid4()}.avro"
-         manifest_write_path = self.metadata_dir / manifest_filename
-         manifest_uri = f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}"
-         output_file = self.file_io.new_output(str(manifest_write_path))
-
-         with write_manifest(
-             format_version=2,
-             spec=self.partition_spec,
-             schema=self.schema,
-             output_file=output_file,
-             snapshot_id=snapshot_id,
-             avro_compression="deflate",
-         ) as writer:
-             for data_file in data_files:
-                 entry = ManifestEntry.from_args(
-                     status=ManifestEntryStatus.ADDED,
-                     snapshot_id=snapshot_id,
-                     sequence_number=sequence_number,
-                     file_sequence_number=sequence_number,
-                     data_file=data_file,
-                 )
-                 writer.add_entry(entry)
-
-         original_manifest = writer.to_manifest_file()
-
-         # Create a new ManifestFile with URI path for metadata references
-         return ManifestFile.from_args(
-             manifest_path=manifest_uri,
-             manifest_length=original_manifest.manifest_length,
-             partition_spec_id=original_manifest.partition_spec_id,
-             content=original_manifest.content,
-             sequence_number=original_manifest.sequence_number,
-             min_sequence_number=original_manifest.min_sequence_number,
-             added_snapshot_id=original_manifest.added_snapshot_id,
-             added_files_count=original_manifest.added_files_count,
-             existing_files_count=original_manifest.existing_files_count,
-             deleted_files_count=original_manifest.deleted_files_count,
-             added_rows_count=original_manifest.added_rows_count,
-             existing_rows_count=original_manifest.existing_rows_count,
-             deleted_rows_count=original_manifest.deleted_rows_count,
-             partitions=original_manifest.partitions,
-             key_metadata=original_manifest.key_metadata,
-         )
-
-     def _create_snapshot_with_ids(
-         self,
-         data_files: List[DataFile],
-         snapshot_id: int,
-         sequence_number: int,
-         properties: Optional[Dict[str, str]] = None,
-     ) -> Snapshot:
-         """Create snapshot with specific IDs.
-
-         Args:
-             data_files: List of DataFile objects
-             snapshot_id: Snapshot ID
-             sequence_number: Sequence number
-             properties: Optional table properties (used to populate snapshot summary)
-
-         Returns:
-             Snapshot object
-         """
-         properties = properties or {}
-         total_records = sum(df.record_count for df in data_files)
-
-         manifest_filename = f"snap-{snapshot_id}-{sequence_number}-{uuid.uuid4()}.avro"
-
-         # Build summary with standard fields + huggingface metadata
-         summary_fields = {
-             "added-data-files": str(len(data_files)),
-             "added-records": str(total_records),
-             "total-data-files": str(len(data_files)),
-             "total-records": str(total_records),
-         }
-
-         # Add hf.* fields from properties to snapshot summary
-         if "hf.dataset.repo" in properties:
-             summary_fields["hf.dataset.repo"] = properties["hf.dataset.repo"]
-         if "hf.dataset.config" in properties:
-             summary_fields["hf.dataset.config"] = properties["hf.dataset.config"]
-         if "hf.dataset.revision" in properties:
-             revision = properties["hf.dataset.revision"]
-             summary_fields["hf.dataset.revision"] = revision
-             # Add short revision (first 7 chars)
-             summary_fields["hf.dataset.revision.short"] = revision[:7]
-
-         return Snapshot(  # type: ignore[call-arg]
-             snapshot_id=snapshot_id,
-             parent_snapshot_id=snapshot_id - 1,
-             sequence_number=sequence_number,
-             timestamp_ms=int(uuid.uuid4().time_low),
-             manifest_list=f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}",
-             summary=Summary(
-                 operation=Operation.APPEND,
-                 **summary_fields,
-             ),
-             schema_id=self.schema.schema_id,
-         )
-
-     def _write_metadata_version(self, metadata: TableMetadataV2, version: int) -> Path:
-         """Write a specific metadata version.
-
-         Args:
-             metadata: TableMetadataV2 object
-             version: Version number
-
-         Returns:
-             Path to the metadata file
-         """
-         # Write metadata file
-         metadata_file = self.metadata_dir / f"v{version}.metadata.json"
-         with open(metadata_file, "w") as f:
-             f.write(metadata.model_dump_json(indent=2))
-
-         # Update version hint
-         version_hint_file = self.metadata_dir / "version-hint.text"
-         with open(version_hint_file, "w") as f:
-             f.write(str(version))
-
-         logger.info(f"Wrote metadata version {version} to {metadata_file}")
-         return metadata_file
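
For context on what was removed, the sketch below shows how the deleted IcebergMetadataWriter could have been driven in faceberg 0.1.1. The class name, FileInfo fields, and create_metadata_from_files parameters come from the code above; the schema fields, repo name, file path, staging directory, and properties values are illustrative assumptions, not values taken from the package, and the example only applies while faceberg.convert still exists (it is gone in 0.1.3).

# Hypothetical usage sketch of the removed module (faceberg 0.1.1).
# All concrete values below are placeholders; only the API shape is taken
# from the deleted source.
import uuid
from pathlib import Path

from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

from faceberg.bridge import FileInfo            # FileInfo fields as used above
from faceberg.convert import IcebergMetadataWriter

# Placeholder schema; in faceberg the bridge layer derives it from the dataset.
schema = Schema(
    NestedField(1, "text", StringType(), required=False),
    NestedField(2, "label", LongType(), required=False),
)

writer = IcebergMetadataWriter(
    table_path=Path("/tmp/staging/my_table"),   # local staging directory (placeholder)
    schema=schema,
    base_uri="hf://datasets/org/repo",          # base for paths recorded in metadata (placeholder)
)

files = [
    FileInfo(
        uri="hf://datasets/org/repo/data/train-00000-of-00001.parquet",  # placeholder path
        split="train",
        size_bytes=0,   # 0 makes the writer query the Hub API for the exact size
        row_count=0,    # row count is read from the Parquet footer
    ),
]

metadata_path = writer.create_metadata_from_files(
    file_infos=files,
    table_uuid=str(uuid.uuid4()),
    properties={"hf.dataset.repo": "org/repo"},  # surfaced in the snapshot summary
)
print(metadata_path)  # .../metadata/v1.metadata.json

Since the writer only records references to the existing Parquet files, running a sketch like this performs no data copies; it reads Parquet footers and file sizes over the network and writes the Avro manifests and JSON metadata into the staging directory.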