faceberg 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg-0.1.2.dist-info/METADATA +149 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- faceberg-0.1.0.dist-info/METADATA +0 -175
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
faceberg/convert.py
DELETED
@@ -1,813 +0,0 @@
-"""Conversion from TableInfo to Iceberg metadata files.
-
-This module takes TableInfo objects (created by the bridge layer) and converts them
-into actual Iceberg metadata files in metadata-only mode, referencing the original
-HuggingFace dataset files.
-"""
-
-import logging
-import time
-import uuid
-from pathlib import Path
-from typing import Callable, Dict, List, Optional
-
-import pyarrow.parquet as pq
-from huggingface_hub import get_hf_file_metadata, hf_hub_url
-from pyiceberg.io.pyarrow import PyArrowFileIO
-from pyiceberg.manifest import (
-    DataFile,
-    DataFileContent,
-    FileFormat,
-    ManifestEntry,
-    ManifestEntryStatus,
-    ManifestFile,
-    write_manifest,
-    write_manifest_list,
-)
-from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
-from pyiceberg.schema import Schema
-from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataV2, new_table_metadata
-from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
-from pyiceberg.table.snapshots import Operation, Snapshot, Summary
-from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
-from pyiceberg.transforms import IdentityTransform
-
-# Import FileInfo (created by bridge layer)
-from .bridge import FileInfo
-
-logger = logging.getLogger(__name__)
-
-
-# TODO(kszucs): parallelize metadata creation for large number of files
-
-
-class IcebergMetadataWriter:
-    """Writes Iceberg metadata files in metadata-only mode.
-
-    This writer creates Iceberg metadata (manifest, manifest list, table metadata)
-    that references existing HuggingFace dataset files without copying or modifying them.
-    """
-
-    def __init__(
-        self,
-        table_path: Path,
-        schema: Schema,
-        partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
-        base_uri: str = None,
-    ):
-        """Initialize metadata writer.
-
-        Args:
-            table_path: Local directory for physically writing files (staging directory)
-            schema: Iceberg schema
-            partition_spec: Partition specification
-            base_uri: Base URI for paths stored in metadata
-                (e.g., "file:///path/to/catalog" or "hf://datasets/org/repo")
-        """
-        self.table_path = table_path
-        self.schema = schema
-        self.partition_spec = partition_spec
-        self.metadata_dir = table_path / "metadata"
-        self.metadata_dir.mkdir(parents=True, exist_ok=True)
-        self.file_io = PyArrowFileIO()
-
-        # Store base URI for metadata references
-        self.base_uri = base_uri.rstrip("/")
-
-    def create_metadata_from_files(
-        self,
-        file_infos: List[FileInfo],
-        table_uuid: str,
-        properties: Optional[Dict[str, str]] = None,
-        progress_callback: Optional[Callable] = None,
-        identifier: Optional[str] = None,
-    ) -> Path:
-        """Create Iceberg metadata from data file information.
-
-        This method creates all necessary Iceberg metadata files:
-        - Manifest file (.avro)
-        - Manifest list (.avro)
-        - Table metadata (v1.metadata.json)
-        - Version hint (version-hint.text)
-
-        Args:
-            file_infos: List of FileInfo objects describing data files
-            table_uuid: UUID for the table
-            properties: Optional table properties
-            progress_callback: Optional callback for progress updates
-            identifier: Optional table identifier for progress reporting
-
-        Returns:
-            Path to the metadata file
-        """
-        logger.info(f"Creating Iceberg metadata for {len(file_infos)} files")
-
-        # Step 1: Read file metadata from HuggingFace Hub
-        enriched_files = self._read_file_metadata(
-            file_infos, progress_callback=progress_callback, identifier=identifier
-        )
-
-        # Step 2: Create DataFile entries
-        data_files = self._create_data_files(enriched_files)
-
-        # Step 3: Write metadata files
-        return self._write_metadata_files(data_files, table_uuid, properties or {})
-
-    def _get_hf_file_size(self, file_path: str) -> int:
-        """Get the actual file size from HuggingFace Hub.
-
-        This queries the HuggingFace API to get the exact file size. While we could
-        calculate an approximate size from Parquet metadata, the calculation is not
-        exact enough for DuckDB's iceberg_scan which needs precise file sizes.
-
-        Args:
-            file_path: HuggingFace file path in format hf://datasets/repo_id/path/to/file
-
-        Returns:
-            File size in bytes
-
-        Raises:
-            ValueError: If file path cannot be parsed or file size cannot be determined
-        """
-        # Parse hf:// URL - format is hf://datasets/org/repo@revision/path/to/file
-        # or hf://datasets/org/repo/path/to/file (without revision)
-        if not file_path.startswith("hf://datasets/"):
-            raise ValueError(f"Invalid HuggingFace file path: {file_path}")
-
-        # Split into repo_id@revision (org/repo@revision) and filename (path/to/file)
-        remaining = file_path[len("hf://datasets/") :]
-        parts = remaining.split("/")
-        if len(parts) < 3:
-            raise ValueError(f"Invalid HuggingFace file path format: {file_path}")
-
-        # Handle repo_id with optional @revision
-        repo_part = f"{parts[0]}/{parts[1]}"  # org/repo@revision or org/repo
-        if "@" in repo_part:
-            repo_id, revision = repo_part.split("@", 1)
-        else:
-            repo_id = repo_part
-            revision = None
-
-        filename = "/".join(parts[2:])  # path/to/file
-        url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset", revision=revision)
-        metadata = get_hf_file_metadata(url)
-        return metadata.size
-
-    def _read_file_metadata(
-        self,
-        file_infos: List[FileInfo],
-        progress_callback: Optional[Callable] = None,
-        identifier: Optional[str] = None,
-    ) -> List[FileInfo]:
-        """Read metadata from HuggingFace Hub files without downloading.
-
-        Args:
-            file_infos: List of FileInfo objects (may have size/row_count = 0)
-            progress_callback: Optional callback for progress updates
-            identifier: Optional table identifier for progress reporting
-
-        Returns:
-            List of FileInfo objects with enriched metadata
-
-        Raises:
-            Exception: If metadata cannot be read from any file
-        """
-        enriched = []
-        total_files = len(file_infos)
-
-        for i, file_info in enumerate(file_infos):
-            # Read metadata directly from HF Hub without downloading the file
-            metadata = pq.read_metadata(file_info.uri)
-            row_count = metadata.num_rows
-
-            # Use provided size if available, otherwise get from HuggingFace API
-            file_size = file_info.size_bytes
-            if not file_size:
-                # Get exact file size from HuggingFace Hub API
-                file_size = self._get_hf_file_size(file_info.uri)
-
-            enriched.append(
-                FileInfo(
-                    uri=file_info.uri,
-                    split=file_info.split,
-                    size_bytes=file_size,
-                    row_count=row_count,
-                )
-            )
-
-            # Report progress after processing each file
-            if progress_callback and identifier:
-                percent = 10 + int((i + 1) / total_files * 80)
-                progress_callback(identifier, state="in_progress", percent=percent)
-
-        return enriched
-
-    def _create_data_files(
-        self,
-        file_infos: List[FileInfo],
-        sequence_number: int = INITIAL_SEQUENCE_NUMBER,
-        previous_data_files: Optional[List[DataFile]] = None,
-    ) -> List[DataFile]:
-        """Create Iceberg DataFile entries from file information.
-
-        Args:
-            file_infos: List of FileInfo objects with metadata
-            sequence_number: Current sequence number (default: 0 for initial snapshot)
-            previous_data_files: Optional list of data files from previous snapshot for
-                inheritance tracking
-
-        Returns:
-            List of Iceberg DataFile objects
-        """
-        # Build lookup of previous files by path for inheritance checking
-        previous_files_map = {}
-        if previous_data_files:
-            for prev_file in previous_data_files:
-                previous_files_map[prev_file.file_path] = prev_file
-
-        data_files = []
-
-        for file_info in file_infos:
-            # Build partition values based on the partition spec
-            # Partition dict maps from partition field position to the partition value
-            if self.partition_spec != UNPARTITIONED_PARTITION_SPEC and file_info.split:
-                # Use position 0 for the first (and only) partition field
-                # Convert split to string (it may be a NamedSplit object from HuggingFace)
-                partition = {0: str(file_info.split)}
-            else:
-                partition = {}
-
-            # Determine file_sequence_number: inherit from previous snapshot if file unchanged
-            prev_file = previous_files_map.get(file_info.uri)
-            if prev_file and self._files_identical(prev_file, file_info):
-                # File unchanged - inherit sequence number
-                file_seq_num = prev_file.file_sequence_number
-            else:
-                # File is new or modified - use current sequence number
-                file_seq_num = sequence_number
-
-            data_file = DataFile.from_args(
-                content=DataFileContent.DATA,
-                file_path=file_info.uri,
-                file_format=FileFormat.PARQUET,
-                partition=partition,
-                record_count=file_info.row_count,
-                file_size_in_bytes=file_info.size_bytes,
-                file_sequence_number=file_seq_num,  # Track inheritance
-                column_sizes={},
-                value_counts={},
-                null_value_counts={},
-                nan_value_counts={},
-                lower_bounds={},
-                upper_bounds={},
-                key_metadata=None,
-                split_offsets=None,
-                equality_ids=None,
-                sort_order_id=None,
-            )
-            data_files.append(data_file)
-
-        return data_files
-
-    def _files_identical(self, prev_file: DataFile, current_file: FileInfo) -> bool:
-        """Check if file is unchanged between snapshots.
-
-        Args:
-            prev_file: DataFile from previous snapshot
-            current_file: FileInfo for current file
-
-        Returns:
-            True if file is unchanged
-        """
-        return (
-            prev_file.file_size_in_bytes == current_file.size_bytes
-            and prev_file.record_count == current_file.row_count
-        )
-
-    def _get_previous_manifests(self, metadata: TableMetadataV2) -> Optional[List[ManifestFile]]:
-        """Extract manifest file references from the current snapshot without reading
-        their contents.
-
-        This method is used for fast append operations to reuse existing manifest files
-        without downloading and reading their contents. This significantly reduces
-        bandwidth and improves performance for remote catalogs.
-
-        Args:
-            metadata: Current table metadata
-
-        Returns:
-            List of ManifestFile objects from current snapshot, or None if no snapshots
-        """
-        # Return None if there's no current snapshot
-        if not metadata.current_snapshot_id or not metadata.snapshots:
-            return None
-
-        # Find the current snapshot
-        current_snapshot = next(
-            (s for s in metadata.snapshots if s.snapshot_id == metadata.current_snapshot_id),
-            None,
-        )
-        if not current_snapshot:
-            return None
-
-        # Return manifest file references (without reading their contents)
-        return list(current_snapshot.manifests(self.file_io))
-
-    def _write_metadata_files(
-        self,
-        data_files: List[DataFile],
-        table_uuid: str,
-        properties: Dict[str, str],
-    ) -> Path:
-        """Write Iceberg table metadata, manifest, and manifest list.
-
-        Args:
-            data_files: List of data files
-            table_uuid: UUID for the table
-            properties: Table properties
-
-        Returns:
-            Path to the metadata file
-        """
-        # Step 1: Create and write manifest
-        manifest_file = self._create_manifest(data_files)
-
-        # Step 2: Create snapshot (with properties for summary fields)
-        snapshot = self._create_snapshot(data_files, properties)
-
-        # Step 3: Write manifest list
-        self._write_manifest_list(snapshot, [manifest_file])
-
-        # Step 4: Create table metadata
-        metadata = self._create_table_metadata(snapshot, table_uuid, properties)
-
-        # Step 5: Write metadata file
-        return self._write_metadata_file(metadata)
-
-    def _create_manifest(self, data_files: List[DataFile]):
-        """Create and write manifest file.
-
-        Args:
-            data_files: List of data files
-
-        Returns:
-            ManifestFile object
-        """
-        manifest_filename = f"{uuid.uuid4()}.avro"
-        manifest_write_path = self.metadata_dir / manifest_filename
-        manifest_uri = f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}"
-        output_file = self.file_io.new_output(str(manifest_write_path))
-
-        with write_manifest(
-            format_version=2,
-            spec=self.partition_spec,
-            schema=self.schema,
-            output_file=output_file,
-            snapshot_id=1,
-            avro_compression="deflate",
-        ) as writer:
-            for data_file in data_files:
-                entry = ManifestEntry.from_args(
-                    status=ManifestEntryStatus.ADDED,
-                    snapshot_id=1,
-                    sequence_number=INITIAL_SEQUENCE_NUMBER,
-                    file_sequence_number=INITIAL_SEQUENCE_NUMBER,
-                    data_file=data_file,
-                )
-                writer.add_entry(entry)
-
-        original_manifest = writer.to_manifest_file()
-
-        # Create a new ManifestFile with URI path for metadata references
-        return ManifestFile.from_args(
-            manifest_path=manifest_uri,
-            manifest_length=original_manifest.manifest_length,
-            partition_spec_id=original_manifest.partition_spec_id,
-            content=original_manifest.content,
-            sequence_number=original_manifest.sequence_number,
-            min_sequence_number=original_manifest.min_sequence_number,
-            added_snapshot_id=original_manifest.added_snapshot_id,
-            added_files_count=original_manifest.added_files_count,
-            existing_files_count=original_manifest.existing_files_count,
-            deleted_files_count=original_manifest.deleted_files_count,
-            added_rows_count=original_manifest.added_rows_count,
-            existing_rows_count=original_manifest.existing_rows_count,
-            deleted_rows_count=original_manifest.deleted_rows_count,
-            partitions=original_manifest.partitions,
-            key_metadata=original_manifest.key_metadata,
-        )
-
-    def _create_snapshot(
-        self, data_files: List[DataFile], properties: Optional[Dict[str, str]] = None
-    ) -> Snapshot:
-        """Create snapshot object.
-
-        Args:
-            data_files: List of data files
-            properties: Optional table properties (used to populate snapshot summary)
-
-        Returns:
-            Snapshot object
-        """
-        properties = properties or {}
-        total_records = sum(df.record_count for df in data_files)
-        manifest_filename = f"snap-1-1-{uuid.uuid4()}.avro"
-
-        # Build summary with standard fields + huggingface metadata
-        summary_fields = {
-            "added-data-files": str(len(data_files)),
-            "added-records": str(total_records),
-            "total-data-files": str(len(data_files)),
-            "total-records": str(total_records),
-        }
-
-        # Add hf.* fields from properties to snapshot summary
-        if "hf.dataset.repo" in properties:
-            summary_fields["hf.dataset.repo"] = properties["hf.dataset.repo"]
-        if "hf.dataset.config" in properties:
-            summary_fields["hf.dataset.config"] = properties["hf.dataset.config"]
-        if "hf.dataset.revision" in properties:
-            revision = properties["hf.dataset.revision"]
-            summary_fields["hf.dataset.revision"] = revision
-            # Add short revision (first 7 chars)
-            summary_fields["hf.dataset.revision.short"] = revision[:7]
-
-        return Snapshot(  # type: ignore[call-arg]
-            snapshot_id=1,
-            parent_snapshot_id=None,
-            sequence_number=INITIAL_SEQUENCE_NUMBER,
-            timestamp_ms=1,
-            manifest_list=f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}",
-            summary=Summary(
-                operation=Operation.APPEND,
-                **summary_fields,
-            ),
-            schema_id=self.schema.schema_id,
-        )
-
-    def _write_manifest_list(self, snapshot: Snapshot, manifest_files: List):
-        """Write manifest list file.
-
-        Args:
-            snapshot: Snapshot object
-            manifest_files: List of manifest files
-        """
-        # Get filename from the snapshot's manifest_list path and write to staging directory
-        manifest_list_filename = Path(snapshot.manifest_list).name
-        manifest_list_write_path = self.metadata_dir / manifest_list_filename
-        manifest_list_output = self.file_io.new_output(str(manifest_list_write_path))
-
-        with write_manifest_list(
-            format_version=2,
-            output_file=manifest_list_output,
-            snapshot_id=snapshot.snapshot_id,
-            parent_snapshot_id=snapshot.parent_snapshot_id,
-            sequence_number=snapshot.sequence_number,
-            avro_compression="deflate",
-        ) as manifest_list_writer:
-            manifest_list_writer.add_manifests(manifest_files)
-
-    def _create_table_metadata(
-        self,
-        snapshot: Snapshot,
-        table_uuid: str,
-        properties: Dict[str, str],
-    ) -> TableMetadataV2:
-        """Create table metadata object.
-
-        Args:
-            snapshot: Snapshot object
-            table_uuid: UUID for the table
-            properties: Table properties
-
-        Returns:
-            TableMetadataV2 object
-        """
-        # Create initial metadata
-        metadata = new_table_metadata(
-            schema=self.schema,
-            partition_spec=self.partition_spec,
-            sort_order=UNSORTED_SORT_ORDER,
-            location=self.base_uri,
-            properties=properties,
-            table_uuid=uuid.UUID(table_uuid),
-        )
-
-        # Update partition spec with correct field IDs if partitioned
-        if self.partition_spec != UNPARTITIONED_PARTITION_SPEC:
-            # Get the reassigned schema from metadata
-            reassigned_schema = metadata.schema()
-            split_field = reassigned_schema.find_field("split")
-            if split_field:
-                # Create partition spec with correct source_id
-                partition_spec_with_correct_ids = PartitionSpec(
-                    PartitionField(
-                        source_id=split_field.field_id,
-                        field_id=1000,
-                        transform=IdentityTransform(),
-                        name="split",
-                    ),
-                    spec_id=0,
-                )
-                # Update metadata with correct partition spec
-                metadata = TableMetadataV2(  # type: ignore[call-arg]
-                    location=metadata.location,
-                    table_uuid=metadata.table_uuid,
-                    last_updated_ms=metadata.last_updated_ms,
-                    last_column_id=metadata.last_column_id,
-                    schemas=metadata.schemas,
-                    current_schema_id=metadata.current_schema_id,
-                    partition_specs=[partition_spec_with_correct_ids],
-                    default_spec_id=0,
-                    last_partition_id=1000,
-                    properties=metadata.properties,
-                    current_snapshot_id=None,
-                    snapshots=[],
-                    snapshot_log=[],
-                    metadata_log=[],
-                    sort_orders=metadata.sort_orders,
-                    default_sort_order_id=metadata.default_sort_order_id,
-                    refs={},
-                    format_version=2,
-                    last_sequence_number=INITIAL_SEQUENCE_NUMBER,
-                )
-
-        # Update metadata with snapshot
-        return TableMetadataV2(  # type: ignore[call-arg]
-            location=metadata.location,
-            table_uuid=metadata.table_uuid,
-            last_updated_ms=metadata.last_updated_ms,
-            last_column_id=metadata.last_column_id,
-            schemas=metadata.schemas,
-            current_schema_id=metadata.current_schema_id,
-            partition_specs=metadata.partition_specs,
-            default_spec_id=metadata.default_spec_id,
-            last_partition_id=metadata.last_partition_id,
-            properties=metadata.properties,
-            current_snapshot_id=snapshot.snapshot_id,
-            snapshots=[snapshot],
-            snapshot_log=[],
-            metadata_log=[],
-            sort_orders=metadata.sort_orders,
-            default_sort_order_id=metadata.default_sort_order_id,
-            refs={
-                "main": SnapshotRef(  # type: ignore[call-arg]
-                    snapshot_id=snapshot.snapshot_id,
-                    type=SnapshotRefType.BRANCH,
-                )
-            },
-            format_version=2,
-            last_sequence_number=INITIAL_SEQUENCE_NUMBER,
-        )
-
-    def _write_metadata_file(self, metadata: TableMetadataV2) -> Path:
-        """Write metadata file and version hint.
-
-        Args:
-            metadata: Table metadata object
-
-        Returns:
-            Path to the metadata file
-        """
-        # Write metadata file - DuckDB expects v1.metadata.json format
-        metadata_file = self.metadata_dir / "v1.metadata.json"
-        with open(metadata_file, "w") as f:
-            f.write(metadata.model_dump_json(indent=2))
-
-        # Write version hint - contains the version number (1)
-        version_hint_file = self.metadata_dir / "version-hint.text"
-        with open(version_hint_file, "w") as f:
-            f.write("1")
-
-        logger.info(f"Wrote metadata to {metadata_file}")
-        return metadata_file
-
-    def append_snapshot_from_files(
-        self,
-        file_infos: List[FileInfo],
-        current_metadata: TableMetadataV2,
-        properties: Optional[Dict[str, str]] = None,
-    ) -> Path:
-        """Append a new snapshot to existing table metadata.
-
-        This method creates a new snapshot with updated files and writes
-        the new metadata version.
-
-        Args:
-            file_infos: List of FileInfo objects describing new data files
-            current_metadata: Current TableMetadataV2 object
-            properties: Optional updated table properties
-
-        Returns:
-            Path to the new metadata file
-        """
-        logger.info(f"Appending snapshot with {len(file_infos)} files")
-
-        # Enrich file metadata
-        enriched_files = self._read_file_metadata(file_infos)
-
-        # Skip inheritance tracking for pure appends (new files only)
-        # The bridge layer already filtered to new files via revision diff,
-        # so no need to compare with previous data files.
-        # This saves 5-25 MB of manifest downloads for remote catalogs.
-        previous_data_files = None
-
-        # Calculate next IDs
-        next_snapshot_id = max(snap.snapshot_id for snap in current_metadata.snapshots) + 1
-        next_sequence_number = current_metadata.last_sequence_number + 1
-
-        # Create DataFile entries (all get new sequence number)
-        data_files = self._create_data_files(
-            enriched_files, next_sequence_number, previous_data_files
-        )
-
-        # Merge properties first (needed for snapshot summary)
-        merged_properties = {**current_metadata.properties}
-        if properties:
-            merged_properties.update(properties)
-
-        # Write manifest for new files only
-        new_manifest_file = self._write_manifest_with_ids(
-            data_files, next_snapshot_id, next_sequence_number
-        )
-
-        # Create snapshot (with properties for summary fields)
-        snapshot = self._create_snapshot_with_ids(
-            data_files, next_snapshot_id, next_sequence_number, merged_properties
-        )
-
-        # Fast append: reuse previous manifests + add new manifest
-        previous_manifests = self._get_previous_manifests(current_metadata)
-        if previous_manifests:
-            all_manifest_files = previous_manifests + [new_manifest_file]
-        else:
-            all_manifest_files = [new_manifest_file]
-
-        # Write manifest list with all manifests
-        self._write_manifest_list(snapshot, all_manifest_files)
-
-        # Create updated metadata
-        updated_metadata = TableMetadataV2(  # type: ignore[call-arg]
-            location=current_metadata.location,
-            table_uuid=current_metadata.table_uuid,
-            last_updated_ms=int(time.time() * 1000),
-            last_column_id=current_metadata.last_column_id,
-            schemas=current_metadata.schemas,
-            current_schema_id=current_metadata.current_schema_id,
-            partition_specs=current_metadata.partition_specs,
-            default_spec_id=current_metadata.default_spec_id,
-            last_partition_id=current_metadata.last_partition_id,
-            properties=merged_properties,
-            current_snapshot_id=snapshot.snapshot_id,
-            snapshots=list(current_metadata.snapshots) + [snapshot],
-            snapshot_log=current_metadata.snapshot_log,
-            metadata_log=current_metadata.metadata_log,
-            sort_orders=current_metadata.sort_orders,
-            default_sort_order_id=current_metadata.default_sort_order_id,
-            refs={
-                "main": SnapshotRef(  # type: ignore[call-arg]
-                    snapshot_id=snapshot.snapshot_id,
-                    type=SnapshotRefType.BRANCH,
-                )
-            },
-            format_version=2,
-            last_sequence_number=next_sequence_number,
-        )
-
-        # Write new metadata file
-        return self._write_metadata_version(updated_metadata, next_sequence_number)
-
-    def _write_manifest_with_ids(
-        self, data_files: List[DataFile], snapshot_id: int, sequence_number: int
-    ):
-        """Write manifest file with specific IDs.
-
-        Args:
-            data_files: List of DataFile objects
-            snapshot_id: Snapshot ID
-            sequence_number: Sequence number
-
-        Returns:
-            ManifestFile object
-        """
-        manifest_filename = f"{uuid.uuid4()}.avro"
-        manifest_write_path = self.metadata_dir / manifest_filename
-        manifest_uri = f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}"
-        output_file = self.file_io.new_output(str(manifest_write_path))
-
-        with write_manifest(
-            format_version=2,
-            spec=self.partition_spec,
-            schema=self.schema,
-            output_file=output_file,
-            snapshot_id=snapshot_id,
-            avro_compression="deflate",
-        ) as writer:
-            for data_file in data_files:
-                entry = ManifestEntry.from_args(
-                    status=ManifestEntryStatus.ADDED,
-                    snapshot_id=snapshot_id,
-                    sequence_number=sequence_number,
-                    file_sequence_number=sequence_number,
-                    data_file=data_file,
-                )
-                writer.add_entry(entry)
-
-        original_manifest = writer.to_manifest_file()
-
-        # Create a new ManifestFile with URI path for metadata references
-        return ManifestFile.from_args(
-            manifest_path=manifest_uri,
-            manifest_length=original_manifest.manifest_length,
-            partition_spec_id=original_manifest.partition_spec_id,
-            content=original_manifest.content,
-            sequence_number=original_manifest.sequence_number,
-            min_sequence_number=original_manifest.min_sequence_number,
-            added_snapshot_id=original_manifest.added_snapshot_id,
-            added_files_count=original_manifest.added_files_count,
-            existing_files_count=original_manifest.existing_files_count,
-            deleted_files_count=original_manifest.deleted_files_count,
-            added_rows_count=original_manifest.added_rows_count,
-            existing_rows_count=original_manifest.existing_rows_count,
-            deleted_rows_count=original_manifest.deleted_rows_count,
-            partitions=original_manifest.partitions,
-            key_metadata=original_manifest.key_metadata,
-        )
-
-    def _create_snapshot_with_ids(
-        self,
-        data_files: List[DataFile],
-        snapshot_id: int,
-        sequence_number: int,
-        properties: Optional[Dict[str, str]] = None,
-    ) -> Snapshot:
-        """Create snapshot with specific IDs.
-
-        Args:
-            data_files: List of DataFile objects
-            snapshot_id: Snapshot ID
-            sequence_number: Sequence number
-            properties: Optional table properties (used to populate snapshot summary)
-
-        Returns:
-            Snapshot object
-        """
-        properties = properties or {}
-        total_records = sum(df.record_count for df in data_files)
-
-        manifest_filename = f"snap-{snapshot_id}-{sequence_number}-{uuid.uuid4()}.avro"
-
-        # Build summary with standard fields + huggingface metadata
-        summary_fields = {
-            "added-data-files": str(len(data_files)),
-            "added-records": str(total_records),
-            "total-data-files": str(len(data_files)),
-            "total-records": str(total_records),
-        }
-
-        # Add hf.* fields from properties to snapshot summary
-        if "hf.dataset.repo" in properties:
-            summary_fields["hf.dataset.repo"] = properties["hf.dataset.repo"]
-        if "hf.dataset.config" in properties:
-            summary_fields["hf.dataset.config"] = properties["hf.dataset.config"]
-        if "hf.dataset.revision" in properties:
-            revision = properties["hf.dataset.revision"]
-            summary_fields["hf.dataset.revision"] = revision
-            # Add short revision (first 7 chars)
-            summary_fields["hf.dataset.revision.short"] = revision[:7]
-
-        return Snapshot(  # type: ignore[call-arg]
-            snapshot_id=snapshot_id,
-            parent_snapshot_id=snapshot_id - 1,
-            sequence_number=sequence_number,
-            timestamp_ms=int(uuid.uuid4().time_low),
-            manifest_list=f"{self.base_uri.rstrip('/')}/metadata/{manifest_filename}",
-            summary=Summary(
-                operation=Operation.APPEND,
-                **summary_fields,
-            ),
-            schema_id=self.schema.schema_id,
-        )
-
-    def _write_metadata_version(self, metadata: TableMetadataV2, version: int) -> Path:
-        """Write a specific metadata version.
-
-        Args:
-            metadata: TableMetadataV2 object
-            version: Version number
-
-        Returns:
-            Path to the metadata file
-        """
-        # Write metadata file
-        metadata_file = self.metadata_dir / f"v{version}.metadata.json"
-        with open(metadata_file, "w") as f:
-            f.write(metadata.model_dump_json(indent=2))
-
-        # Update version hint
-        version_hint_file = self.metadata_dir / "version-hint.text"
-        with open(version_hint_file, "w") as f:
-            f.write(str(version))
-
-        logger.info(f"Wrote metadata version {version} to {metadata_file}")
-        return metadata_file