faceberg 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg-0.1.2.dist-info/METADATA +149 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- faceberg-0.1.0.dist-info/METADATA +0 -175
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
faceberg/iceberg.py
ADDED
@@ -0,0 +1,707 @@
"""Iceberg metadata generation utilities.

This module provides functions for creating Apache Iceberg table metadata from
Parquet files. The main entry point is write_snapshot(), which takes a complete
list of files and generates all required Iceberg metadata (manifests, snapshots,
table metadata).

Data Flow
---------
1. User provides List[ParquetFile] representing desired snapshot state
2. diff_snapshot() compares against previous to determine changes
3. write_manifest() converts to Iceberg DataFiles and writes single manifest
4. create_snapshot() reads manifest to build snapshot with statistics
5. Table metadata written to JSON files

Key Concepts
------------
ParquetFile: Simple dataclass with uri, size, and hash fields
ManifestEntry: Iceberg entry with status (ADDED/EXISTING/DELETED) + DataFile
Snapshot: Point-in-time view of table with summary statistics
Operation: Type of snapshot (APPEND/DELETE/OVERWRITE) determined by entry statuses

Public API
----------
write_snapshot(): Main entry point for creating Iceberg metadata
create_schema(): Convert PyArrow schema to Iceberg schema with field IDs (optionally
    with split column)
create_partition_spec(): Create a partition spec with optional split partitioning
ParquetFile: Dataclass representing a parquet file to include in snapshot

File Structure and Type Hierarchy
----------------------------------

Physical Files Created:
    table/
    └── metadata/
        ├── v1.metadata.json (TableMetadataV2)
        ├── v2.metadata.json (TableMetadataV2) - for subsequent snapshots
        ├── version-hint.text (current version number)
        ├── snap-1-0-<uuid>.avro (ManifestList)
        ├── snap-2-1-<uuid>.avro (ManifestList) - for subsequent snapshots
        ├── <uuid>.avro (Manifest file)
        └── <uuid>.avro (Manifest file) - one per snapshot

Type Hierarchy:
    TableMetadataV2                          # Root metadata object
    ├── schemas: List[Schema]                # Iceberg schema definitions
    ├── partition_specs: List[PartitionSpec]
    ├── snapshots: List[Snapshot]            # All table snapshots
    │   └── Snapshot
    │       ├── snapshot_id: int
    │       ├── manifest_list: str           # → snap-X-Y-<uuid>.avro
    │       └── summary: Summary             # Operation stats + HF metadata
    └── refs: Dict[str, SnapshotRef]         # Branch references (e.g., "main")

    ManifestList (snap-X-Y-<uuid>.avro)      # Written to manifest_list path
    └── manifests: List[ManifestFile]        # References to manifest files
        └── ManifestFile
            ├── manifest_path: str           # → <uuid>.avro
            ├── added_files_count: int
            ├── added_rows_count: int
            └── partition_spec_id: int

    Manifest (<uuid>.avro)                   # Written to manifest_path
    └── entries: List[ManifestEntry]
        └── ManifestEntry
            ├── status: ManifestEntryStatus  # ADDED/EXISTING/DELETED
            ├── snapshot_id: int
            ├── sequence_number: int
            └── data_file: DataFile          # ↓

    DataFile                                 # References actual data
    ├── file_path: str                       # → hf://datasets/org/repo@rev/file.parquet
    ├── file_format: FileFormat              # PARQUET
    ├── partition: Dict[int, str]            # {0: "train"} for split partitioning
    ├── record_count: int                    # Number of rows
    ├── file_size_in_bytes: int
    └── file_sequence_number: int            # Tracks when file was added

Note: DataFile objects reference external HuggingFace parquet files without
copying them. All metadata files use Iceberg's Avro format for manifests and
JSON for table metadata.
"""

import uuid
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pyarrow as pa
import pyarrow.parquet as pq
from pyiceberg.io import FileIO
from pyiceberg.io.pyarrow import (
    PyArrowFileIO,
    _pyarrow_to_schema_without_ids,
    compute_statistics_plan,
    data_file_statistics_from_parquet_metadata,
)
from pyiceberg.io.pyarrow import parquet_path_to_id_mapping as _parquet_path_to_id_mapping
from pyiceberg.manifest import (
    DataFile,
    DataFileContent,
    ManifestEntry,
    ManifestEntryStatus,
    ManifestFile,
    write_manifest_list,
)
from pyiceberg.manifest import write_manifest as _write_manifest
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
from pyiceberg.schema import Schema, assign_fresh_schema_ids
from pyiceberg.table import TableProperties
from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataV2, new_table_metadata
from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
from pyiceberg.table.snapshots import (
    Operation,
    Snapshot,
    SnapshotSummaryCollector,
    Summary,
    update_snapshot_summaries,
)
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import NestedField, StringType

from .discover import ParquetFile

def diff_snapshot(
    current_files: List[ParquetFile],
    previous_metadata: Optional[TableMetadataV2],
    io: FileIO,
) -> List[Tuple[ManifestEntryStatus, ParquetFile]]:
    """Diff current files against previous snapshot.

    Compares files based on uri/size/hash to determine status:
    - ADDED: File exists in current but not in previous
    - EXISTING: File exists in both with same size and hash
    - REMOVED: File exists in previous but not in current
    - REMOVED + ADDED: File exists in both but size/hash changed

    Args:
        current_files: List of current ParquetFile objects
        previous_metadata: Previous table metadata (None for initial snapshot)
        io: FileIO for reading previous manifests

    Returns:
        List of (status, ParquetFile) tuples
    """
    # If no previous metadata, all files are ADDED
    if previous_metadata is None:
        return [(ManifestEntryStatus.ADDED, pf) for pf in current_files]

    # Build map of previous files: uri -> size
    previous_snapshot = previous_metadata.snapshot_by_id(previous_metadata.current_snapshot_id)
    if previous_snapshot is None:
        return [(ManifestEntryStatus.ADDED, pf) for pf in current_files]

    # Read all files from previous snapshot
    previous_files_map: Dict[str, int] = {}
    for manifest in previous_snapshot.manifests(io):
        for entry in manifest.fetch_manifest_entry(io=io, discard_deleted=True):
            df = entry.data_file
            previous_files_map[df.file_path] = df.file_size_in_bytes

    # Build map of current files
    current_files_map: Dict[str, ParquetFile] = {pf.uri: pf for pf in current_files}

    result: List[Tuple[ManifestEntryStatus, ParquetFile]] = []

    # Process current files
    for pf in current_files:
        if pf.uri not in previous_files_map:
            # New file
            result.append((ManifestEntryStatus.ADDED, pf))
        else:
            prev_size = previous_files_map[pf.uri]
            # Check if size changed (we don't have hash in DataFile, so use size as proxy)
            if pf.size == prev_size:
                # Same file
                result.append((ManifestEntryStatus.EXISTING, pf))
            else:
                # File changed: REMOVED (old) + ADDED (new)
                # Create ParquetFile for old version
                old_pf = ParquetFile(
                    uri=pf.uri, path=pf.path, size=prev_size, blob_id="", split=None
                )
                result.append((ManifestEntryStatus.DELETED, old_pf))
                result.append((ManifestEntryStatus.ADDED, pf))

    # Process removed files (in previous but not in current)
    for uri, size in previous_files_map.items():
        if uri not in current_files_map:
            # File was removed - extract path from URI
            # URI format: hf://datasets/repo@revision/path
            path = uri.split("@", 1)[1].split("/", 1)[1] if "@" in uri else ""
            removed_pf = ParquetFile(uri=uri, path=path, size=size, blob_id="", split=None)
            result.append((ManifestEntryStatus.DELETED, removed_pf))

    return result

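To make the status assignment concrete, here is a minimal sketch of the initial-snapshot case, with the later outcomes noted in comments. It is illustrative only: the repository, path, size, and blob_id values are made up, and the import paths assume the module layout shown in this diff.

from pyiceberg.io.pyarrow import PyArrowFileIO

from faceberg.discover import ParquetFile
from faceberg.iceberg import diff_snapshot

train = ParquetFile(
    uri="hf://datasets/org/repo@main/data/train-00000-of-00001.parquet",
    path="data/train-00000-of-00001.parquet",
    size=1024,
    blob_id="aaaa",
    split="train",
)

# Initial snapshot: no previous metadata, so every file comes back as ADDED.
statuses = diff_snapshot([train], previous_metadata=None, io=PyArrowFileIO())
# -> [(ManifestEntryStatus.ADDED, train)]

# On a later call with previous metadata supplied: an unchanged size yields
# EXISTING, a changed size yields DELETED (old) plus ADDED (new), and a file
# missing from the current list yields DELETED.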
def create_schema(arrow_schema: pa.Schema, include_split_column: bool = False) -> Schema:
    """Convert PyArrow schema to Iceberg Schema.

    Converts PyArrow schema to Iceberg Schema with globally unique field IDs
    assigned to all fields (including nested structures).

    Args:
        arrow_schema: PyArrow schema to convert
        include_split_column: If True, adds a 'split' column as the first field (default: False)

    Returns:
        Iceberg Schema with field IDs assigned
    """
    # Convert to schema without IDs, then assign fresh IDs
    schema_without_ids = _pyarrow_to_schema_without_ids(arrow_schema)
    schema = assign_fresh_schema_ids(schema_without_ids)

    # Add split column as the first field if requested
    if include_split_column:
        # Create split field (will get ID 1 after reassignment)
        # Note: Although the schema uses StringType, the actual Parquet data
        # will use dictionary encoding (int8 indices) for compression efficiency
        # The split column is optional since it doesn't exist in the source Parquet files,
        # it's derived from partition metadata
        split_field = NestedField(
            field_id=-1,  # Temporary ID, will be reassigned
            name="split",
            field_type=StringType(),
            required=False,
        )
        # Prepend split field to existing fields
        new_fields = [split_field] + list(schema.fields)

        # Create new schema and reassign all field IDs globally
        # This ensures field IDs are globally unique across nested structures
        schema_with_split = Schema(*new_fields)
        schema = assign_fresh_schema_ids(schema_with_split)

    return schema

def create_partition_spec(schema: Schema, include_split_column: bool = False) -> PartitionSpec:
    """Build a partition spec with optional split partitioning.

    Creates an identity partition on the split field when requested.
    When False, returns an unpartitioned spec.

    Args:
        schema: Iceberg schema
        include_split_column: Whether to partition by split field (default: False)

    Returns:
        PartitionSpec with split partition key if include_split_column is True,
        or UNPARTITIONED_PARTITION_SPEC otherwise

    Raises:
        ValueError: If include_split_column is True but schema doesn't contain a 'split' field
    """
    if not include_split_column:
        return UNPARTITIONED_PARTITION_SPEC

    split_field = schema.find_field("split")
    if split_field is None:
        raise ValueError("Schema must contain a 'split' field to create split partition spec")

    return PartitionSpec(
        PartitionField(
            source_id=split_field.field_id,
            field_id=1000,  # Partition field IDs start at 1000
            transform=IdentityTransform(),
            name="split",
        ),
        spec_id=0,
    )

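A short sketch of how these two helpers compose, assuming a toy PyArrow schema; the comments describe the expected outcome (split prepended with fresh field IDs, identity partition with field_id 1000) rather than exact printed output.

import pyarrow as pa

from faceberg.iceberg import create_partition_spec, create_schema

arrow_schema = pa.schema([("text", pa.string()), ("label", pa.int64())])

# 'split' is prepended and all field IDs are reassigned globally.
iceberg_schema = create_schema(arrow_schema, include_split_column=True)
spec = create_partition_spec(iceberg_schema, include_split_column=True)

print(iceberg_schema)  # fields: split, text, label with freshly assigned IDs
print(spec)            # identity transform on 'split', partition field_id=1000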
# TODO(kszucs): copied from pyiceberg.io.pyarrow with modifications to resolve list
# field mapping issues, remove once fixed in pyiceberg
def parquet_path_to_id_mapping(schema: Schema) -> dict[str, int]:
    """Build a field mapping that handles both 'element' and 'item' list conventions.

    Creates mappings for both PyArrow-compliant ('element') and actual Parquet
    schema paths. This handles cases where Parquet files use 'item' (Arrow convention)
    instead of 'element' (Parquet spec).
    """
    # Start with standard iceberg mapping (uses 'element')
    base_mapping = _parquet_path_to_id_mapping(schema)

    # Create alternative mappings by replacing 'element' with 'item'
    flexible_mapping = dict(base_mapping)
    for path, field_id in base_mapping.items():
        if ".list.element" in path:
            # Add mapping with 'item' convention
            alt_path = path.replace(".list.element", ".list.item")
            flexible_mapping[alt_path] = field_id

    return flexible_mapping

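The same rewrite in isolation: for a hypothetical column tokens: list<string> whose element carries field ID 2, the standard mapping only knows the Parquet-spec path, while the flexible mapping accepts the Arrow-style path as well.

base_mapping = {"tokens.list.element": 2}  # what the standard mapping produces
flexible_mapping = dict(base_mapping)
for path, field_id in base_mapping.items():
    if ".list.element" in path:
        flexible_mapping[path.replace(".list.element", ".list.item")] = field_id

assert flexible_mapping == {"tokens.list.element": 2, "tokens.list.item": 2}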
# TODO(kszucs): copied from pyiceberg.io.pyarrow with modifications to resolve list
# field mapping issues, remove once fixed in pyiceberg
def parquet_file_to_data_file(
    io: FileIO,
    table_metadata: "TableMetadataV2",
    parquet_file: ParquetFile,
    include_split_column: bool = True,
) -> DataFile:
    """Convert ParquetFile to DataFile using flexible field mapping.

    This implementation builds a flexible field mapping that supports both
    'element' (Parquet spec) and 'item' (Arrow convention) for list fields,
    handling Parquet files written by both spec-compliant and non-compliant writers.

    Args:
        io: FileIO for reading parquet files
        table_metadata: Table metadata containing schema and partition spec
        parquet_file: ParquetFile with uri, size, and optional split metadata
        include_split_column: If True, includes split from ParquetFile in partition

    Returns:
        DataFile with appropriate partition values
    """
    input_file = io.new_input(parquet_file.uri)
    with input_file.open() as f:
        parquet_metadata = pq.read_metadata(f)

    schema = table_metadata.schema()
    spec = table_metadata.spec()

    # Use flexible mapping that handles both 'element' and 'item'
    statistics = data_file_statistics_from_parquet_metadata(
        parquet_metadata=parquet_metadata,
        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
        parquet_column_mapping=parquet_path_to_id_mapping(schema),
    )

    # Get partition from statistics (handles columns present in parquet file)
    partition = statistics.partition(spec, schema)
    # Add split to partition if requested and we have split metadata
    # The split is not in the parquet file itself, it's metadata we know about the file
    if include_split_column:
        for i, field in enumerate(spec.fields):
            if field.name == "split":
                partition[i] = parquet_file.split

    return DataFile.from_args(
        content=DataFileContent.DATA,
        file_path=parquet_file.uri,
        file_format="PARQUET",
        partition=partition,
        file_size_in_bytes=parquet_file.size,
        sort_order_id=None,
        spec_id=table_metadata.default_spec_id,
        equality_ids=None,
        key_metadata=None,
        **statistics.to_serialized_dict(),
    )


# TODO(kszucs): allow parallel calls to parquet_file_to_data_file
def write_manifest(
    files: List[Tuple[ManifestEntryStatus, ParquetFile]],
    metadata: TableMetadataV2,
    schema: Schema,
    spec: PartitionSpec,
    snapshot_id: int,
    sequence_number: int,
    io: FileIO,
    output_file,
    uri: str,
    include_split_column: bool = False,
) -> Tuple[ManifestFile, List]:
    """Create and write a manifest file.

    Converts ParquetFile objects to DataFile objects and writes them
    to a single manifest with their respective statuses.

    Args:
        files: List of (status, ParquetFile) tuples
        metadata: Table metadata for reading parquet files
        schema: Iceberg schema
        spec: Partition specification
        snapshot_id: Snapshot ID for the entries
        sequence_number: Sequence number for the entries
        io: FileIO instance for reading files
        output_file: OutputFile to write to
        uri: URI path to use in the returned ManifestFile
        include_split_column: If True, includes split from ParquetFile in partition

    Returns:
        Tuple of (ManifestFile object, List of ManifestEntry objects)
    """
    entries = []
    with _write_manifest(
        format_version=2,
        spec=spec,
        schema=schema,
        output_file=output_file,
        snapshot_id=snapshot_id,
        avro_compression="deflate",
    ) as writer:
        for status, parquet_file in files:
            # Convert ParquetFile to DataFile
            data_file = parquet_file_to_data_file(
                io=io,
                table_metadata=metadata,
                parquet_file=parquet_file,
                include_split_column=include_split_column,
            )

            # Create manifest entry with the appropriate status
            entry = ManifestEntry.from_args(
                status=status,
                snapshot_id=snapshot_id,
                sequence_number=sequence_number,
                file_sequence_number=sequence_number,
                data_file=data_file,
            )
            writer.add_entry(entry)
            entries.append(entry)
    manifest = writer.to_manifest_file()

    manifest_file = ManifestFile.from_args(
        manifest_path=uri,
        manifest_length=manifest.manifest_length,
        partition_spec_id=manifest.partition_spec_id,
        content=manifest.content,
        sequence_number=manifest.sequence_number,
        min_sequence_number=manifest.min_sequence_number,
        added_snapshot_id=manifest.added_snapshot_id,
        added_files_count=manifest.added_files_count,
        existing_files_count=manifest.existing_files_count,
        deleted_files_count=manifest.deleted_files_count,
        added_rows_count=manifest.added_rows_count,
        existing_rows_count=manifest.existing_rows_count,
        deleted_rows_count=manifest.deleted_rows_count,
        partitions=manifest.partitions,
        key_metadata=manifest.key_metadata,
    )

    return manifest_file, entries

def create_snapshot(
    manifest_entries: List,
    manifest_list_path: str,
    snapshot_id: int,
    parent_snapshot_id: Optional[int],
    sequence_number: int,
    schema_id: int,
    spec: PartitionSpec,
    schema: Schema,
    previous_summary: Optional[Summary] = None,
) -> Snapshot:
    """Create Snapshot object with proper summary.

    Uses SnapshotSummaryCollector and update_snapshot_summaries() to
    compute accurate statistics from the provided manifest entries.

    Args:
        manifest_entries: List of ManifestEntry objects. Must be provided to avoid
            file I/O issues with staging directories. The entries should be collected
            during manifest creation.
        manifest_list_path: Path to the manifest list
        snapshot_id: Snapshot ID
        parent_snapshot_id: Parent snapshot ID
        sequence_number: Sequence number
        schema_id: Schema ID
        spec: Partition specification
        schema: Iceberg schema
        previous_summary: Summary from previous snapshot (for totals)

    Returns:
        Snapshot object
    """
    # Build summary by processing manifest entries in a single pass
    ssc = SnapshotSummaryCollector(partition_summary_limit=0)
    has_added = False
    has_removed = False

    for entry in manifest_entries:
        if entry.status == ManifestEntryStatus.ADDED:
            ssc.add_file(entry.data_file, schema=schema, partition_spec=spec)
            has_added = True
        elif entry.status == ManifestEntryStatus.DELETED:
            ssc.remove_file(entry.data_file, schema=schema, partition_spec=spec)
            has_removed = True

    # Determine operation type
    if has_removed and has_added:
        operation = Operation.OVERWRITE
    elif has_removed:
        operation = Operation.DELETE
    else:
        operation = Operation.APPEND

    summary = Summary(operation=operation, **ssc.build())
    summary = update_snapshot_summaries(summary, previous_summary)

    return Snapshot(
        snapshot_id=snapshot_id,
        parent_snapshot_id=parent_snapshot_id,
        sequence_number=sequence_number,
        timestamp_ms=int(uuid.uuid4().time_low),  # Use UUID time component
        manifest_list=manifest_list_path,
        summary=summary,
        schema_id=schema_id,
    )

def write_snapshot(
    files: List[ParquetFile],
    schema: pa.Schema,
    current_metadata: Optional[TableMetadataV2],
    output_dir: Path,
    base_uri: str,
    properties: Optional[Dict[str, str]] = None,
    include_split_column: bool = True,
    io: Optional[FileIO] = None,
) -> TableMetadataV2:
    """Write new snapshot metadata.

    This is the main entry point for creating Iceberg metadata. Compares the
    provided files against the previous snapshot to determine operations:
    - APPEND: only added files
    - DELETE: only removed files
    - OVERWRITE: both added and removed files

    Args:
        files: Complete list of ParquetFile objects for this snapshot
        schema: PyArrow schema
        current_metadata: Existing metadata (None for initial snapshot)
        output_dir: Directory to write metadata files
        base_uri: Base URI for paths in metadata
        properties: Table properties
        include_split_column: Whether to include a 'split' column in the schema and partition
            by it. When True, adds a split column to the schema and partitions by split.
            When False, uses unpartitioned spec (default: True)
        io: Optional FileIO instance (default: PyArrowFileIO)

    Returns:
        Updated TableMetadataV2
    """
    properties = properties or {}
    io = io or PyArrowFileIO()

    # Ensure metadata directory exists
    metadata_dir = output_dir / "metadata"
    metadata_dir.mkdir(parents=True, exist_ok=True)

    # Set up context based on whether this is initial or subsequent snapshot
    if current_metadata is None:
        table_uuid = uuid.UUID(str(uuid.uuid4()))
        snapshot_id = 1
        sequence_number = INITIAL_SEQUENCE_NUMBER
        parent_snapshot_id = None
        previous_summary = None

        # Convert schema with optional split column
        iceberg_schema = create_schema(schema, include_split_column=include_split_column)
        merged_properties = {
            **properties,
            TableProperties.DEFAULT_NAME_MAPPING: iceberg_schema.name_mapping.model_dump_json(),
        }

        # Create partition spec (partition by split if split column is included)
        spec = create_partition_spec(iceberg_schema, include_split_column=include_split_column)

        # Create preliminary metadata for reading parquet files
        file_metadata = new_table_metadata(
            schema=iceberg_schema,
            partition_spec=spec,
            sort_order=UNSORTED_SORT_ORDER,
            location=base_uri,
            properties=merged_properties,
            table_uuid=table_uuid,
        )
    else:
        table_uuid = current_metadata.table_uuid
        snapshot_id = max(s.snapshot_id for s in current_metadata.snapshots) + 1
        sequence_number = current_metadata.last_sequence_number + 1
        parent_snapshot_id = current_metadata.current_snapshot_id

        previous_snapshot = current_metadata.snapshot_by_id(parent_snapshot_id)
        previous_summary = previous_snapshot.summary if previous_snapshot else None

        iceberg_schema = current_metadata.schema()
        file_metadata = current_metadata
        spec = current_metadata.spec()

        merged_properties = {**current_metadata.properties}
        if properties:
            merged_properties.update(properties)

    # Diff the provided files against previous snapshot
    diff_results = diff_snapshot(files, current_metadata, io)

    # Create single manifest with all entries (mixed statuses)
    manifest_filename = f"{uuid.uuid4()}.avro"
    manifest_path = metadata_dir / manifest_filename
    manifest_uri = f"{base_uri}/metadata/{manifest_filename}"

    output_file = io.new_output(str(manifest_path))
    # Write manifest with final URI and get entries
    manifest, manifest_entries = write_manifest(
        diff_results,
        file_metadata,
        iceberg_schema,
        spec,
        snapshot_id,
        sequence_number,
        io,
        output_file,
        manifest_uri,
        include_split_column=include_split_column,
    )

    # Create manifest list
    manifest_list_filename = f"snap-{snapshot_id}-{sequence_number}-{uuid.uuid4()}.avro"
    manifest_list_path = metadata_dir / manifest_list_filename
    manifest_list_uri = f"{base_uri}/metadata/{manifest_list_filename}"

    manifest_list_output = io.new_output(str(manifest_list_path))
    with write_manifest_list(
        format_version=2,
        output_file=manifest_list_output,
        snapshot_id=snapshot_id,
        parent_snapshot_id=parent_snapshot_id,
        sequence_number=sequence_number,
        avro_compression="deflate",
    ) as writer:
        writer.add_manifests([manifest])

    # Create snapshot using the collected manifest entries (avoids reading from file)
    snapshot = create_snapshot(
        manifest_entries,
        manifest_list_uri,
        snapshot_id,
        parent_snapshot_id,
        sequence_number,
        iceberg_schema.schema_id,
        spec,
        iceberg_schema,
        previous_summary=previous_summary,
    )

    # Create table metadata
    if current_metadata is None:
        metadata = TableMetadataV2(
            location=base_uri,
            table_uuid=table_uuid,
            last_updated_ms=snapshot.timestamp_ms,
            last_column_id=iceberg_schema.highest_field_id,
            schemas=[iceberg_schema],
            current_schema_id=iceberg_schema.schema_id,
            partition_specs=[spec],
            default_spec_id=spec.spec_id,
            last_partition_id=spec.last_assigned_field_id,
            properties=merged_properties,
            current_snapshot_id=snapshot.snapshot_id,
            snapshots=[snapshot],
            snapshot_log=[],
            metadata_log=[],
            sort_orders=[UNSORTED_SORT_ORDER],
            default_sort_order_id=UNSORTED_SORT_ORDER.order_id,
            refs={
                "main": SnapshotRef(snapshot_id=snapshot.snapshot_id, type=SnapshotRefType.BRANCH)
            },
            format_version=2,
            last_sequence_number=sequence_number,
        )
    else:
        metadata = TableMetadataV2(
            location=current_metadata.location,
            table_uuid=table_uuid,
            last_updated_ms=snapshot.timestamp_ms,
            last_column_id=current_metadata.last_column_id,
            schemas=current_metadata.schemas,
            current_schema_id=current_metadata.current_schema_id,
            partition_specs=current_metadata.partition_specs,
            default_spec_id=current_metadata.default_spec_id,
            last_partition_id=current_metadata.last_partition_id,
            properties=merged_properties,
            current_snapshot_id=snapshot.snapshot_id,
            snapshots=list(current_metadata.snapshots) + [snapshot],
            snapshot_log=current_metadata.snapshot_log,
            metadata_log=current_metadata.metadata_log,
            sort_orders=current_metadata.sort_orders,
            default_sort_order_id=current_metadata.default_sort_order_id,
            refs={
                "main": SnapshotRef(snapshot_id=snapshot.snapshot_id, type=SnapshotRefType.BRANCH)
            },
            format_version=2,
            last_sequence_number=sequence_number,
        )

    # Write metadata file and version hint
    version = len(metadata.snapshots)
    metadata_file = metadata_dir / f"v{version}.metadata.json"
    with open(metadata_file, "w") as f:
        f.write(metadata.model_dump_json(indent=2))

    version_hint_file = metadata_dir / "version-hint.text"
    with open(version_hint_file, "w") as f:
        f.write(str(version))

    return metadata
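For orientation, a minimal end-to-end sketch of the public API added in this release. The repository name, file size, and blob_id are placeholders, the import paths assume the module layout shown above, and a real run needs a FileIO that can read the referenced parquet file, since write_snapshot() inspects each file's footer for statistics.

from pathlib import Path

import pyarrow as pa

from faceberg.discover import ParquetFile
from faceberg.iceberg import write_snapshot

files = [
    ParquetFile(
        uri="hf://datasets/org/repo@main/data/train-00000-of-00001.parquet",
        path="data/train-00000-of-00001.parquet",
        size=123_456,
        blob_id="deadbeef",
        split="train",
    )
]
schema = pa.schema([("text", pa.string()), ("label", pa.int64())])

# Initial snapshot: no previous metadata, so the diff marks every file ADDED
# and the resulting snapshot operation is APPEND.
metadata = write_snapshot(
    files=files,
    schema=schema,
    current_metadata=None,
    output_dir=Path("/tmp/table"),
    base_uri="file:///tmp/table",
    include_split_column=True,
)

# Passing the returned metadata back in with an updated file list produces an
# APPEND, DELETE, or OVERWRITE snapshot depending on what changed.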