faceberg 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
faceberg/iceberg.py ADDED
@@ -0,0 +1,707 @@
1
+ """Iceberg metadata generation utilities.
2
+
3
+ This module provides functions for creating Apache Iceberg table metadata from
4
+ Parquet files. The main entry point is write_snapshot(), which takes a complete
5
+ list of files and generates all required Iceberg metadata (manifests, snapshots,
6
+ table metadata).
7
+
8
+ Data Flow
9
+ ---------
10
+ 1. User provides List[ParquetFile] representing desired snapshot state
11
+ 2. diff_snapshot() compares against previous to determine changes
12
+ 3. write_manifest() converts to Iceberg DataFiles and writes single manifest
13
+ 4. create_snapshot() reads manifest to build snapshot with statistics
14
+ 5. Table metadata written to JSON files
15
+
16
+ Key Concepts
17
+ ------------
18
+ ParquetFile: Simple dataclass with uri, path, size, blob_id (content hash), and split fields
19
+ ManifestEntry: Iceberg entry with status (ADDED/EXISTING/DELETED) + DataFile
20
+ Snapshot: Point-in-time view of table with summary statistics
21
+ Operation: Type of snapshot (APPEND/DELETE/OVERWRITE) determined by entry statuses
22
+
23
+ Public API
24
+ ----------
25
+ write_snapshot(): Main entry point for creating Iceberg metadata
26
+ create_schema(): Convert PyArrow schema to Iceberg schema with field IDs (optionally
27
+ with split column)
28
+ create_partition_spec(): Create a partition spec with optional split partitioning
29
+ ParquetFile: Dataclass representing a parquet file to include in snapshot
30
+
31
+ File Structure and Type Hierarchy
32
+ ---------------------------------
33
+
34
+ Physical Files Created:
35
+ table/
36
+ └── metadata/
37
+ ├── v1.metadata.json (TableMetadataV2)
38
+ ├── v2.metadata.json (TableMetadataV2) - for subsequent snapshots
39
+ ├── version-hint.text (current version number)
40
+ ├── snap-1-0-<uuid>.avro (ManifestList)
41
+ ├── snap-2-1-<uuid>.avro (ManifestList) - for subsequent snapshots
42
+ ├── <uuid>.avro (Manifest file)
43
+ └── <uuid>.avro (Manifest file) - one per snapshot
44
+
45
+ Type Hierarchy:
46
+ TableMetadataV2 # Root metadata object
47
+ ├── schemas: List[Schema] # Iceberg schema definitions
48
+ ├── partition_specs: List[PartitionSpec]
49
+ ├── snapshots: List[Snapshot] # All table snapshots
50
+ │ └── Snapshot
51
+ │ ├── snapshot_id: int
52
+ │ ├── manifest_list: str # → snap-X-Y-<uuid>.avro
53
+ │ └── summary: Summary # Operation stats + HF metadata
54
+ └── refs: Dict[str, SnapshotRef] # Branch references (e.g., "main")
55
+
56
+ ManifestList (snap-X-Y-<uuid>.avro) # Written to manifest_list path
57
+ └── manifests: List[ManifestFile] # References to manifest files
58
+ └── ManifestFile
59
+ ├── manifest_path: str # → <uuid>.avro
60
+ ├── added_files_count: int
61
+ ├── added_rows_count: int
62
+ └── partition_spec_id: int
63
+
64
+ Manifest (<uuid>.avro) # Written to manifest_path
65
+ └── entries: List[ManifestEntry]
66
+ └── ManifestEntry
67
+ ├── status: ManifestEntryStatus # ADDED/EXISTING/DELETED
68
+ ├── snapshot_id: int
69
+ ├── sequence_number: int
70
+ └── data_file: DataFile # ↓
71
+
72
+ DataFile # References actual data
73
+ ├── file_path: str # → hf://datasets/org/repo@rev/file.parquet
74
+ ├── file_format: FileFormat # PARQUET
75
+ ├── partition: Dict[int, str] # {0: "train"} for split partitioning
76
+ ├── record_count: int # Number of rows
77
+ ├── file_size_in_bytes: int
78
+ └── file_sequence_number: int # Tracks when file was added
79
+
80
+ Note: DataFile objects reference external HuggingFace parquet files without
81
+ copying them. All metadata files use Iceberg's Avro format for manifests and
82
+ JSON for table metadata.
83
+ """
84
+
85
+ import uuid
86
+ from pathlib import Path
87
+ from typing import Dict, List, Optional, Tuple
88
+
89
+ import pyarrow as pa
90
+ import pyarrow.parquet as pq
91
+ from pyiceberg.io import FileIO
92
+ from pyiceberg.io.pyarrow import (
93
+ PyArrowFileIO,
94
+ _pyarrow_to_schema_without_ids,
95
+ compute_statistics_plan,
96
+ data_file_statistics_from_parquet_metadata,
97
+ )
98
+ from pyiceberg.io.pyarrow import parquet_path_to_id_mapping as _parquet_path_to_id_mapping
99
+ from pyiceberg.manifest import (
100
+ DataFile,
101
+ DataFileContent,
102
+ ManifestEntry,
103
+ ManifestEntryStatus,
104
+ ManifestFile,
105
+ write_manifest_list,
106
+ )
107
+ from pyiceberg.manifest import write_manifest as _write_manifest
108
+ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
109
+ from pyiceberg.schema import Schema, assign_fresh_schema_ids
110
+ from pyiceberg.table import TableProperties
111
+ from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataV2, new_table_metadata
112
+ from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
113
+ from pyiceberg.table.snapshots import (
114
+ Operation,
115
+ Snapshot,
116
+ SnapshotSummaryCollector,
117
+ Summary,
118
+ update_snapshot_summaries,
119
+ )
120
+ from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
121
+ from pyiceberg.transforms import IdentityTransform
122
+ from pyiceberg.types import NestedField, StringType
123
+
124
+ from .discover import ParquetFile
125
+
126
+
127
+ def diff_snapshot(
128
+ current_files: List[ParquetFile],
129
+ previous_metadata: Optional[TableMetadataV2],
130
+ io: FileIO,
131
+ ) -> List[Tuple[ManifestEntryStatus, ParquetFile]]:
132
+ """Diff current files against previous snapshot.
133
+
134
+ Compares files by uri and size to determine status (the previous DataFile stores no hash, so a size change is the proxy for a content change):
135
+ - ADDED: File exists in current but not in previous
136
+ - EXISTING: File exists in both with same size and hash
137
+ - DELETED: File exists in previous but not in current
138
+ - DELETED + ADDED: File exists in both but its size changed
139
+
140
+ Args:
141
+ current_files: List of current ParquetFile objects
142
+ previous_metadata: Previous table metadata (None for initial snapshot)
143
+ io: FileIO for reading previous manifests
144
+
145
+ Returns:
146
+ List of (status, ParquetFile) tuples
147
+ """
148
+ # If no previous metadata, all files are ADDED
149
+ if previous_metadata is None:
150
+ return [(ManifestEntryStatus.ADDED, pf) for pf in current_files]
151
+
152
+ # Build map of previous files: uri -> size
153
+ previous_snapshot = previous_metadata.snapshot_by_id(previous_metadata.current_snapshot_id)
154
+ if previous_snapshot is None:
155
+ return [(ManifestEntryStatus.ADDED, pf) for pf in current_files]
156
+
157
+ # Read all files from previous snapshot
158
+ previous_files_map: Dict[str, int] = {}
159
+ for manifest in previous_snapshot.manifests(io):
160
+ for entry in manifest.fetch_manifest_entry(io=io, discard_deleted=True):
161
+ df = entry.data_file
162
+ previous_files_map[df.file_path] = df.file_size_in_bytes
163
+
164
+ # Build map of current files
165
+ current_files_map: Dict[str, ParquetFile] = {pf.uri: pf for pf in current_files}
166
+
167
+ result: List[Tuple[ManifestEntryStatus, ParquetFile]] = []
168
+
169
+ # Process current files
170
+ for pf in current_files:
171
+ if pf.uri not in previous_files_map:
172
+ # New file
173
+ result.append((ManifestEntryStatus.ADDED, pf))
174
+ else:
175
+ prev_size = previous_files_map[pf.uri]
176
+ # Check if size changed (we don't have hash in DataFile, so use size as proxy)
177
+ if pf.size == prev_size:
178
+ # Same file
179
+ result.append((ManifestEntryStatus.EXISTING, pf))
180
+ else:
181
+ # File changed: REMOVED (old) + ADDED (new)
182
+ # Create ParquetFile for old version
183
+ old_pf = ParquetFile(
184
+ uri=pf.uri, path=pf.path, size=prev_size, blob_id="", split=None
185
+ )
186
+ result.append((ManifestEntryStatus.DELETED, old_pf))
187
+ result.append((ManifestEntryStatus.ADDED, pf))
188
+
189
+ # Process removed files (in previous but not in current)
190
+ for uri, size in previous_files_map.items():
191
+ if uri not in current_files_map:
192
+ # File was removed - extract path from URI
193
+ # URI format: hf://datasets/repo@revision/path
194
+ path = uri.split("@", 1)[1].split("/", 1)[1] if "@" in uri else ""
195
+ removed_pf = ParquetFile(uri=uri, path=path, size=size, blob_id="", split=None)
196
+ result.append((ManifestEntryStatus.DELETED, removed_pf))
197
+
198
+ return result
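
A small sketch of the status semantics, reusing the hypothetical `files` list from the module-level example. On the initial snapshot no file I/O happens because there is no previous manifest to read:

    from pyiceberg.io.pyarrow import PyArrowFileIO
    from pyiceberg.manifest import ManifestEntryStatus

    changes = diff_snapshot(files, previous_metadata=None, io=PyArrowFileIO())
    assert all(status is ManifestEntryStatus.ADDED for status, _ in changes)

    # Against an existing snapshot, an unchanged file comes back EXISTING,
    # a resized file comes back as DELETED (old size) plus ADDED (new size),
    # and a file missing from `files` comes back DELETED.
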
199
+
200
+
201
+ def create_schema(arrow_schema: pa.Schema, include_split_column: bool = False) -> Schema:
202
+ """Convert PyArrow schema to Iceberg Schema.
203
+
204
+ Converts PyArrow schema to Iceberg Schema with globally unique field IDs
205
+ assigned to all fields (including nested structures).
206
+
207
+ Args:
208
+ arrow_schema: PyArrow schema to convert
209
+ include_split_column: If True, adds a 'split' column as the first field (default: False)
210
+
211
+ Returns:
212
+ Iceberg Schema with field IDs assigned
213
+ """
214
+ # Convert to schema without IDs, then assign fresh IDs
215
+ schema_without_ids = _pyarrow_to_schema_without_ids(arrow_schema)
216
+ schema = assign_fresh_schema_ids(schema_without_ids)
217
+
218
+ # Add split column as the first field if requested
219
+ if include_split_column:
220
+ # Create split field (will get ID 1 after reassignment)
221
+ # Note: Although the schema uses StringType, the actual Parquet data
222
+ # will use dictionary encoding (int8 indices) for compression efficiency.
223
+ # The split column is optional because it doesn't exist in the source
224
+ # Parquet files; it's derived from partition metadata.
225
+ split_field = NestedField(
226
+ field_id=-1, # Temporary ID, will be reassigned
227
+ name="split",
228
+ field_type=StringType(),
229
+ required=False,
230
+ )
231
+ # Prepend split field to existing fields
232
+ new_fields = [split_field] + list(schema.fields)
233
+
234
+ # Create new schema and reassign all field IDs globally
235
+ # This ensures field IDs are globally unique across nested structures
236
+ schema_with_split = Schema(*new_fields)
237
+ schema = assign_fresh_schema_ids(schema_with_split)
238
+
239
+ return schema
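
A quick sketch of the conversion. As the comment above notes, the split field is prepended and picks up field ID 1 once IDs are reassigned globally:

    import pyarrow as pa

    arrow_schema = pa.schema([("text", pa.string()), ("label", pa.int64())])
    iceberg_schema = create_schema(arrow_schema, include_split_column=True)

    assert iceberg_schema.fields[0].name == "split"
    assert iceberg_schema.fields[0].field_id == 1  # IDs reassigned, split comes first
    assert not iceberg_schema.fields[0].required   # optional: not present in the parquet data
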
240
+
241
+
242
+ def create_partition_spec(schema: Schema, include_split_column: bool = False) -> PartitionSpec:
243
+ """Build a partition spec with optional split partitioning.
244
+
245
+ Creates an identity partition on the split field when requested.
246
+ When False, returns an unpartitioned spec.
247
+
248
+ Args:
249
+ schema: Iceberg schema
250
+ include_split_column: Whether to partition by split field (default: False)
251
+
252
+ Returns:
253
+ PartitionSpec with split partition key if include_split_column is True,
254
+ or UNPARTITIONED_PARTITION_SPEC otherwise
255
+
256
+ Raises:
257
+ ValueError: If include_split_column is True but schema doesn't contain a 'split' field
258
+ """
259
+ if not include_split_column:
260
+ return UNPARTITIONED_PARTITION_SPEC
261
+
262
+ split_field = schema.find_field("split")
263
+ if split_field is None:
264
+ raise ValueError("Schema must contain a 'split' field to create split partition spec")
265
+
266
+ return PartitionSpec(
267
+ PartitionField(
268
+ source_id=split_field.field_id,
269
+ field_id=1000, # Partition field IDs start at 1000
270
+ transform=IdentityTransform(),
271
+ name="split",
272
+ ),
273
+ spec_id=0,
274
+ )
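
Continuing the sketch above, the resulting spec is an identity partition on the split field; with include_split_column=False the unpartitioned constant is returned unchanged:

    from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC

    spec = create_partition_spec(iceberg_schema, include_split_column=True)
    assert spec.fields[0].name == "split"
    assert spec.fields[0].field_id == 1000  # partition field IDs start at 1000

    assert create_partition_spec(iceberg_schema, include_split_column=False) is UNPARTITIONED_PARTITION_SPEC
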
275
+
276
+
277
+ # TODO(kszucs): copied from pyiceberg.io.pyarrow with modifications to resolve list
278
+ # field mapping issues, remove once fixed in pyiceberg
279
+ def parquet_path_to_id_mapping(schema: Schema) -> dict[str, int]:
280
+ """Build a field mapping that handles both 'element' and 'item' list conventions.
281
+
282
+ Creates mappings for both PyArrow-compliant ('element') and actual Parquet
283
+ schema paths. This handles cases where Parquet files use 'item' (Arrow convention)
284
+ instead of 'element' (Parquet spec).
285
+ """
286
+ # Start with standard iceberg mapping (uses 'element')
287
+ base_mapping = _parquet_path_to_id_mapping(schema)
288
+
289
+ # Create alternative mappings by replacing 'element' with 'item'
290
+ flexible_mapping = dict(base_mapping)
291
+ for path, field_id in base_mapping.items():
292
+ if ".list.element" in path:
293
+ # Add mapping with 'item' convention
294
+ alt_path = path.replace(".list.element", ".list.item")
295
+ flexible_mapping[alt_path] = field_id
296
+
297
+ return flexible_mapping
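
A sketch of the doubled-up mapping for a list column. The exact path strings assume pyiceberg's '<column>.list.element' naming, which is what the replacement above targets:

    from pyiceberg.schema import Schema
    from pyiceberg.types import ListType, NestedField, StringType

    schema = Schema(
        NestedField(
            field_id=1,
            name="tags",
            field_type=ListType(element_id=2, element_type=StringType(), element_required=False),
            required=False,
        ),
    )
    mapping = parquet_path_to_id_mapping(schema)

    # Both writer conventions resolve to the same field ID.
    assert mapping["tags.list.element"] == 2
    assert mapping["tags.list.item"] == 2
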
298
+
299
+
300
+ # TODO(kszucs): copied from pyiceberg.io.pyarrow with modifications to resolve list
301
+ # field mapping issues, remove once fixed in pyiceberg
302
+ def parquet_file_to_data_file(
303
+ io: FileIO,
304
+ table_metadata: "TableMetadataV2",
305
+ parquet_file: ParquetFile,
306
+ include_split_column: bool = True,
307
+ ) -> DataFile:
308
+ """Convert ParquetFile to DataFile using flexible field mapping.
309
+
310
+ This implementation builds a flexible field mapping that supports both
311
+ 'element' (Parquet spec) and 'item' (Arrow convention) for list fields,
312
+ handling Parquet files written by both spec-compliant and non-compliant writers.
313
+
314
+ Args:
315
+ io: FileIO for reading parquet files
316
+ table_metadata: Table metadata containing schema and partition spec
317
+ parquet_file: ParquetFile with uri, size, and optional split metadata
318
+ include_split_column: If True, includes split from ParquetFile in partition
319
+
320
+ Returns:
321
+ DataFile with appropriate partition values
322
+ """
323
+ input_file = io.new_input(parquet_file.uri)
324
+ with input_file.open() as f:
325
+ parquet_metadata = pq.read_metadata(f)
326
+
327
+ schema = table_metadata.schema()
328
+ spec = table_metadata.spec()
329
+
330
+ # Use flexible mapping that handles both 'element' and 'item'
331
+ statistics = data_file_statistics_from_parquet_metadata(
332
+ parquet_metadata=parquet_metadata,
333
+ stats_columns=compute_statistics_plan(schema, table_metadata.properties),
334
+ parquet_column_mapping=parquet_path_to_id_mapping(schema),
335
+ )
336
+
337
+ # Get partition from statistics (handles columns present in parquet file)
338
+ partition = statistics.partition(spec, schema)
339
+ # Add split to partition if requested and we have split metadata
340
+ # The split is not in the parquet file itself; it's metadata we know about the file
341
+ if include_split_column:
342
+ for i, field in enumerate(spec.fields):
343
+ if field.name == "split":
344
+ partition[i] = parquet_file.split
345
+
346
+ return DataFile.from_args(
347
+ content=DataFileContent.DATA,
348
+ file_path=parquet_file.uri,
349
+ file_format="PARQUET",
350
+ partition=partition,
351
+ file_size_in_bytes=parquet_file.size,
352
+ sort_order_id=None,
353
+ spec_id=table_metadata.default_spec_id,
354
+ equality_ids=None,
355
+ key_metadata=None,
356
+ **statistics.to_serialized_dict(),
357
+ )
358
+
359
+
360
+ # TODO(kszucs): allow parallel calls to parquet_file_to_data_file
361
+ def write_manifest(
362
+ files: List[Tuple[ManifestEntryStatus, ParquetFile]],
363
+ metadata: TableMetadataV2,
364
+ schema: Schema,
365
+ spec: PartitionSpec,
366
+ snapshot_id: int,
367
+ sequence_number: int,
368
+ io: FileIO,
369
+ output_file,
370
+ uri: str,
371
+ include_split_column: bool = False,
372
+ ) -> Tuple[ManifestFile, List]:
373
+ """Create and write a manifest file.
374
+
375
+ Converts ParquetFile objects to DataFile objects and writes them
376
+ to a single manifest with their respective statuses.
377
+
378
+ Args:
379
+ files: List of (status, ParquetFile) tuples
380
+ metadata: Table metadata for reading parquet files
381
+ schema: Iceberg schema
382
+ spec: Partition specification
383
+ snapshot_id: Snapshot ID for the entries
384
+ sequence_number: Sequence number for the entries
385
+ io: FileIO instance for reading files
386
+ output_file: OutputFile to write to
387
+ uri: URI path to use in the returned ManifestFile
388
+ include_split_column: If True, includes split from ParquetFile in partition
389
+
390
+ Returns:
391
+ Tuple of (ManifestFile object, List of ManifestEntry objects)
392
+ """
393
+ entries = []
394
+ with _write_manifest(
395
+ format_version=2,
396
+ spec=spec,
397
+ schema=schema,
398
+ output_file=output_file,
399
+ snapshot_id=snapshot_id,
400
+ avro_compression="deflate",
401
+ ) as writer:
402
+ for status, parquet_file in files:
403
+ # Convert ParquetFile to DataFile
404
+ data_file = parquet_file_to_data_file(
405
+ io=io,
406
+ table_metadata=metadata,
407
+ parquet_file=parquet_file,
408
+ include_split_column=include_split_column,
409
+ )
410
+
411
+ # Create manifest entry with the appropriate status
412
+ entry = ManifestEntry.from_args(
413
+ status=status,
414
+ snapshot_id=snapshot_id,
415
+ sequence_number=sequence_number,
416
+ file_sequence_number=sequence_number,
417
+ data_file=data_file,
418
+ )
419
+ writer.add_entry(entry)
420
+ entries.append(entry)
421
+ manifest = writer.to_manifest_file()
422
+
423
+ manifest_file = ManifestFile.from_args(
424
+ manifest_path=uri,
425
+ manifest_length=manifest.manifest_length,
426
+ partition_spec_id=manifest.partition_spec_id,
427
+ content=manifest.content,
428
+ sequence_number=manifest.sequence_number,
429
+ min_sequence_number=manifest.min_sequence_number,
430
+ added_snapshot_id=manifest.added_snapshot_id,
431
+ added_files_count=manifest.added_files_count,
432
+ existing_files_count=manifest.existing_files_count,
433
+ deleted_files_count=manifest.deleted_files_count,
434
+ added_rows_count=manifest.added_rows_count,
435
+ existing_rows_count=manifest.existing_rows_count,
436
+ deleted_rows_count=manifest.deleted_rows_count,
437
+ partitions=manifest.partitions,
438
+ key_metadata=manifest.key_metadata,
439
+ )
440
+
441
+ return manifest_file, entries
442
+
443
+
444
+ def create_snapshot(
445
+ manifest_entries: List,
446
+ manifest_list_path: str,
447
+ snapshot_id: int,
448
+ parent_snapshot_id: Optional[int],
449
+ sequence_number: int,
450
+ schema_id: int,
451
+ spec: PartitionSpec,
452
+ schema: Schema,
453
+ previous_summary: Optional[Summary] = None,
454
+ ) -> Snapshot:
455
+ """Create Snapshot object with proper summary.
456
+
457
+ Uses SnapshotSummaryCollector and update_snapshot_summaries() to
458
+ compute accurate statistics from the provided manifest entries.
459
+
460
+ Args:
461
+ manifest_entries: List of ManifestEntry objects. Must be provided to avoid
462
+ file I/O issues with staging directories. The entries should be collected
463
+ during manifest creation.
464
+ manifest_list_path: Path to the manifest list
465
+ snapshot_id: Snapshot ID
466
+ parent_snapshot_id: Parent snapshot ID
467
+ sequence_number: Sequence number
468
+ schema_id: Schema ID
469
+ spec: Partition specification
470
+ schema: Iceberg schema
471
+ previous_summary: Summary from previous snapshot (for totals)
472
+
473
+ Returns:
474
+ Snapshot object
475
+ """
476
+ # Build summary by processing manifest entries in a single pass
477
+ ssc = SnapshotSummaryCollector(partition_summary_limit=0)
478
+ has_added = False
479
+ has_removed = False
480
+
481
+ for entry in manifest_entries:
482
+ if entry.status == ManifestEntryStatus.ADDED:
483
+ ssc.add_file(entry.data_file, schema=schema, partition_spec=spec)
484
+ has_added = True
485
+ elif entry.status == ManifestEntryStatus.DELETED:
486
+ ssc.remove_file(entry.data_file, schema=schema, partition_spec=spec)
487
+ has_removed = True
488
+
489
+ # Determine operation type
490
+ if has_removed and has_added:
491
+ operation = Operation.OVERWRITE
492
+ elif has_removed:
493
+ operation = Operation.DELETE
494
+ else:
495
+ operation = Operation.APPEND
496
+
497
+ summary = Summary(operation=operation, **ssc.build())
498
+ summary = update_snapshot_summaries(summary, previous_summary)
499
+
500
+ return Snapshot(
501
+ snapshot_id=snapshot_id,
502
+ parent_snapshot_id=parent_snapshot_id,
503
+ sequence_number=sequence_number,
504
+ timestamp_ms=int(uuid.uuid4().time_low), # Pseudo-timestamp derived from a random UUID, not wall-clock time
505
+ manifest_list=manifest_list_path,
506
+ summary=summary,
507
+ schema_id=schema_id,
508
+ )
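
The operation is derived purely from the entry statuses: ADDED only gives APPEND, DELETED only gives DELETE, a mix gives OVERWRITE. A hedged sketch of inspecting the result after the write_snapshot() call from the earlier example; the keys shown are the standard Iceberg snapshot-summary properties populated by SnapshotSummaryCollector:

    snap = metadata.snapshot_by_id(metadata.current_snapshot_id)
    print(snap.summary.operation)            # Operation.APPEND for an add-only snapshot
    print(snap.summary["added-data-files"])  # per-snapshot counts from SnapshotSummaryCollector
    print(snap.summary["total-records"])     # running totals merged in via previous_summary
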
509
+
510
+
511
+ def write_snapshot(
512
+ files: List[ParquetFile],
513
+ schema: pa.Schema,
514
+ current_metadata: Optional[TableMetadataV2],
515
+ output_dir: Path,
516
+ base_uri: str,
517
+ properties: Optional[Dict[str, str]] = None,
518
+ include_split_column: bool = True,
519
+ io: Optional[FileIO] = None,
520
+ ) -> TableMetadataV2:
521
+ """Write new snapshot metadata.
522
+
523
+ This is the main entry point for creating Iceberg metadata. Compares the
524
+ provided files against the previous snapshot to determine operations:
525
+ - APPEND: only added files
526
+ - DELETE: only removed files
527
+ - OVERWRITE: both added and removed files
528
+
529
+ Args:
530
+ files: Complete list of ParquetFile objects for this snapshot
531
+ schema: PyArrow schema
532
+ current_metadata: Existing metadata (None for initial snapshot)
533
+ output_dir: Directory to write metadata files
534
+ base_uri: Base URI for paths in metadata
535
+ properties: Table properties
536
+ include_split_column: Whether to include a 'split' column in the schema and partition
537
+ by it. When True, adds a split column to the schema and partitions by split.
538
+ When False, uses unpartitioned spec (default: True)
539
+ io: Optional FileIO instance (default: PyArrowFileIO)
540
+
541
+ Returns:
542
+ Updated TableMetadataV2
543
+ """
544
+ properties = properties or {}
545
+ io = io or PyArrowFileIO()
546
+
547
+ # Ensure metadata directory exists
548
+ metadata_dir = output_dir / "metadata"
549
+ metadata_dir.mkdir(parents=True, exist_ok=True)
550
+
551
+ # Set up context based on whether this is initial or subsequent snapshot
552
+ if current_metadata is None:
553
+ table_uuid = uuid.UUID(str(uuid.uuid4()))
554
+ snapshot_id = 1
555
+ sequence_number = INITIAL_SEQUENCE_NUMBER
556
+ parent_snapshot_id = None
557
+ previous_summary = None
558
+
559
+ # Convert schema with optional split column
560
+ iceberg_schema = create_schema(schema, include_split_column=include_split_column)
561
+ merged_properties = {
562
+ **properties,
563
+ TableProperties.DEFAULT_NAME_MAPPING: iceberg_schema.name_mapping.model_dump_json(),
564
+ }
565
+
566
+ # Create partition spec (partition by split if split column is included)
567
+ spec = create_partition_spec(iceberg_schema, include_split_column=include_split_column)
568
+
569
+ # Create preliminary metadata for reading parquet files
570
+ file_metadata = new_table_metadata(
571
+ schema=iceberg_schema,
572
+ partition_spec=spec,
573
+ sort_order=UNSORTED_SORT_ORDER,
574
+ location=base_uri,
575
+ properties=merged_properties,
576
+ table_uuid=table_uuid,
577
+ )
578
+ else:
579
+ table_uuid = current_metadata.table_uuid
580
+ snapshot_id = max(s.snapshot_id for s in current_metadata.snapshots) + 1
581
+ sequence_number = current_metadata.last_sequence_number + 1
582
+ parent_snapshot_id = current_metadata.current_snapshot_id
583
+
584
+ previous_snapshot = current_metadata.snapshot_by_id(parent_snapshot_id)
585
+ previous_summary = previous_snapshot.summary if previous_snapshot else None
586
+
587
+ iceberg_schema = current_metadata.schema()
588
+ file_metadata = current_metadata
589
+ spec = current_metadata.spec()
590
+
591
+ merged_properties = {**current_metadata.properties}
592
+ if properties:
593
+ merged_properties.update(properties)
594
+
595
+ # Diff the provided files against previous snapshot
596
+ diff_results = diff_snapshot(files, current_metadata, io)
597
+
598
+ # Create single manifest with all entries (mixed statuses)
599
+ manifest_filename = f"{uuid.uuid4()}.avro"
600
+ manifest_path = metadata_dir / manifest_filename
601
+ manifest_uri = f"{base_uri}/metadata/{manifest_filename}"
602
+
603
+ output_file = io.new_output(str(manifest_path))
604
+ # Write manifest with final URI and get entries
605
+ manifest, manifest_entries = write_manifest(
606
+ diff_results,
607
+ file_metadata,
608
+ iceberg_schema,
609
+ spec,
610
+ snapshot_id,
611
+ sequence_number,
612
+ io,
613
+ output_file,
614
+ manifest_uri,
615
+ include_split_column=include_split_column,
616
+ )
617
+
618
+ # Create manifest list
619
+ manifest_list_filename = f"snap-{snapshot_id}-{sequence_number}-{uuid.uuid4()}.avro"
620
+ manifest_list_path = metadata_dir / manifest_list_filename
621
+ manifest_list_uri = f"{base_uri}/metadata/{manifest_list_filename}"
622
+
623
+ manifest_list_output = io.new_output(str(manifest_list_path))
624
+ with write_manifest_list(
625
+ format_version=2,
626
+ output_file=manifest_list_output,
627
+ snapshot_id=snapshot_id,
628
+ parent_snapshot_id=parent_snapshot_id,
629
+ sequence_number=sequence_number,
630
+ avro_compression="deflate",
631
+ ) as writer:
632
+ writer.add_manifests([manifest])
633
+
634
+ # Create snapshot using the collected manifest entries (avoids reading from file)
635
+ snapshot = create_snapshot(
636
+ manifest_entries,
637
+ manifest_list_uri,
638
+ snapshot_id,
639
+ parent_snapshot_id,
640
+ sequence_number,
641
+ iceberg_schema.schema_id,
642
+ spec,
643
+ iceberg_schema,
644
+ previous_summary=previous_summary,
645
+ )
646
+
647
+ # Create table metadata
648
+ if current_metadata is None:
649
+ metadata = TableMetadataV2(
650
+ location=base_uri,
651
+ table_uuid=table_uuid,
652
+ last_updated_ms=snapshot.timestamp_ms,
653
+ last_column_id=iceberg_schema.highest_field_id,
654
+ schemas=[iceberg_schema],
655
+ current_schema_id=iceberg_schema.schema_id,
656
+ partition_specs=[spec],
657
+ default_spec_id=spec.spec_id,
658
+ last_partition_id=spec.last_assigned_field_id,
659
+ properties=merged_properties,
660
+ current_snapshot_id=snapshot.snapshot_id,
661
+ snapshots=[snapshot],
662
+ snapshot_log=[],
663
+ metadata_log=[],
664
+ sort_orders=[UNSORTED_SORT_ORDER],
665
+ default_sort_order_id=UNSORTED_SORT_ORDER.order_id,
666
+ refs={
667
+ "main": SnapshotRef(snapshot_id=snapshot.snapshot_id, type=SnapshotRefType.BRANCH)
668
+ },
669
+ format_version=2,
670
+ last_sequence_number=sequence_number,
671
+ )
672
+ else:
673
+ metadata = TableMetadataV2(
674
+ location=current_metadata.location,
675
+ table_uuid=table_uuid,
676
+ last_updated_ms=snapshot.timestamp_ms,
677
+ last_column_id=current_metadata.last_column_id,
678
+ schemas=current_metadata.schemas,
679
+ current_schema_id=current_metadata.current_schema_id,
680
+ partition_specs=current_metadata.partition_specs,
681
+ default_spec_id=current_metadata.default_spec_id,
682
+ last_partition_id=current_metadata.last_partition_id,
683
+ properties=merged_properties,
684
+ current_snapshot_id=snapshot.snapshot_id,
685
+ snapshots=list(current_metadata.snapshots) + [snapshot],
686
+ snapshot_log=current_metadata.snapshot_log,
687
+ metadata_log=current_metadata.metadata_log,
688
+ sort_orders=current_metadata.sort_orders,
689
+ default_sort_order_id=current_metadata.default_sort_order_id,
690
+ refs={
691
+ "main": SnapshotRef(snapshot_id=snapshot.snapshot_id, type=SnapshotRefType.BRANCH)
692
+ },
693
+ format_version=2,
694
+ last_sequence_number=sequence_number,
695
+ )
696
+
697
+ # Write metadata file and version hint
698
+ version = len(metadata.snapshots)
699
+ metadata_file = metadata_dir / f"v{version}.metadata.json"
700
+ with open(metadata_file, "w") as f:
701
+ f.write(metadata.model_dump_json(indent=2))
702
+
703
+ version_hint_file = metadata_dir / "version-hint.text"
704
+ with open(version_hint_file, "w") as f:
705
+ f.write(str(version))
706
+
707
+ return metadata
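
Once written, the metadata follows the version-hint layout shown in the module docstring, so it can be opened read-only with pyiceberg. A sketch under the assumption that output_dir was "table" as in the earlier example; reading back with StaticTable is one option, not something this module requires:

    from pyiceberg.table import StaticTable

    table = StaticTable.from_metadata("table/metadata/v1.metadata.json")
    print(table.current_snapshot().summary.operation)  # e.g. Operation.APPEND

    for task in table.scan().plan_files():
        print(task.file.file_path)  # hf://datasets/... URIs are referenced, never copied
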