opteryx-catalog 0.4.13 (opteryx_catalog-0.4.13-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opteryx-catalog might be problematic.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +4 -0
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +1201 -0
- opteryx_catalog/catalog/manifest.py +438 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +255 -0
- opteryx_catalog/opteryx_catalog.py +979 -0
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- opteryx_catalog-0.4.13.dist-info/METADATA +466 -0
- opteryx_catalog-0.4.13.dist-info/RECORD +28 -0
- opteryx_catalog-0.4.13.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.13.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.13.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +29 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0
- tests/test_webhooks.py +177 -0
opteryx_catalog/catalog/dataset.py
@@ -0,0 +1,1201 @@
from __future__ import annotations

import os
import time
import uuid
from dataclasses import dataclass
from typing import Any
from typing import Iterable
from typing import Optional

from .manifest import ParquetManifestEntry
from .manifest import build_parquet_manifest_entry
from .manifest import build_parquet_manifest_minmax_entry
from .metadata import DatasetMetadata
from .metadata import Snapshot
from .metastore import Dataset

# Stable node identifier for this process (hex-mac-hex-pid)
_NODE = f"{uuid.getnode():x}-{os.getpid():x}"


@dataclass
class Datafile:
    """Wrapper for a manifest entry representing a data file."""

    entry: dict

    @property
    def file_path(self) -> Optional[str]:
        return self.entry.get("file_path")

    @property
    def record_count(self) -> int:
        return int(self.entry.get("record_count") or 0)

    @property
    def file_size_in_bytes(self) -> int:
        return int(self.entry.get("file_size_in_bytes") or 0)

    def to_dict(self) -> dict:
        return dict(self.entry)

    @property
    def min_k_hashes(self) -> list:
        return self.entry.get("min_k_hashes") or []

    @property
    def histogram_counts(self) -> list:
        return self.entry.get("histogram_counts") or []

    @property
    def histogram_bins(self) -> int:
        return int(self.entry.get("histogram_bins") or 0)

    @property
    def min_values(self) -> list:
        return self.entry.get("min_values") or []

    @property
    def max_values(self) -> list:
        return self.entry.get("max_values") or []


@dataclass
class SimpleDataset(Dataset):
    identifier: str
    _metadata: DatasetMetadata
    io: Any = None
    catalog: Any = None

    @property
    def metadata(self) -> DatasetMetadata:
        return self._metadata

    def _next_sequence_number(self) -> int:
        """Calculate the next sequence number.

        Uses the current snapshot's sequence number + 1. Works efficiently
        with load_history=False since we only need the most recent snapshot,
        not the full history.

        Returns:
            The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
        """
        if not self.metadata.snapshots:
            # No snapshots yet - this is the first one
            return 1

        # Get the current (most recent) snapshot - should have the highest sequence number
        current = self.snapshot()
        if current:
            seq = getattr(current, "sequence_number", None)
            return int(seq) + 1 if seq is not None else 1

    def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
        """Return a Snapshot.

        - If `snapshot_id` is None, return the in-memory current snapshot.
        - If a `snapshot_id` is provided, prefer a Firestore lookup via the
          attached `catalog` (O(1) document get). Fall back to the in-memory
          `metadata.snapshots` list only when no catalog is attached or the
          remote lookup fails.
        """
        # Current snapshot: keep in memory for fast access
        if snapshot_id is None:
            return self.metadata.current_snapshot()

        # Try Firestore document lookup when catalog attached
        if self.catalog:
            try:
                collection, dataset_name = self.identifier.split(".")
                doc = (
                    self.catalog._dataset_doc_ref(collection, dataset_name)
                    .collection("snapshots")
                    .document(str(snapshot_id))
                    .get()
                )
                if doc.exists:
                    sd = doc.to_dict() or {}
                    snap = Snapshot(
                        snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
                        timestamp_ms=int(sd.get("timestamp-ms", 0)),
                        author=sd.get("author"),
                        sequence_number=sd.get("sequence-number", 0),
                        user_created=sd.get("user-created"),
                        manifest_list=sd.get("manifest"),
                        schema_id=sd.get("schema-id"),
                        summary=sd.get("summary", {}),
                        operation_type=sd.get("operation-type"),
                        parent_snapshot_id=sd.get("parent-snapshot-id"),
                        commit_message=sd.get("commit-message"),
                    )
                    return snap
            except Exception:
                # Be conservative: fall through to in-memory fallback
                pass

        # Fallback: search in-memory snapshots (only used when no catalog)
        for s in self.metadata.snapshots:
            if s.snapshot_id == snapshot_id:
                return s

        return None

    def _get_node(self) -> str:
        """Return the stable node identifier for this process.

        Uses a module-level constant to avoid per-instance hashing/caching.
        """
        return _NODE

    def snapshots(self) -> Iterable[Snapshot]:
        return list(self.metadata.snapshots)

    def schema(self, schema_id: Optional[str] = None) -> Optional[dict]:
        """Return a stored schema description.

        If `schema_id` is None, return the current schema (by
        `metadata.current_schema_id` or last-known schema). If a
        specific `schema_id` is provided, attempt to find it in the
        in-memory `metadata.schemas` list and, failing that, fetch it
        from the catalog's `schemas` subcollection when a catalog is
        attached.

        Returns the stored schema dict (contains keys like `schema_id`,
        `columns`, `timestamp-ms`, etc.) or None if not found.
        """
        # Determine which schema id to use
        sid = schema_id or self.metadata.current_schema_id

        # If no sid and a raw schema is stored on the metadata, return it
        if sid is None:
            return getattr(self.metadata, "schema", None)

        # Fast path: if this is the current schema id, prefer the cached
        # current schema (99% case) rather than scanning the entire list.
        sdict = None
        if sid == self.metadata.current_schema_id:
            if getattr(self.metadata, "schemas", None):
                last = self.metadata.schemas[-1]
                if last.get("schema_id") == sid:
                    sdict = last
            else:
                # If a raw schema is stored directly on metadata, use it.
                raw = getattr(self.metadata, "schema", None)
                if raw is not None:
                    sdict = {"schema_id": sid, "columns": raw}

        # If not the current schema, or cached current not present,
        # prefer to load the schema document from the backend (O(1) doc get).
        if sdict is None and self.catalog:
            try:
                collection, dataset_name = self.identifier.split(".")
                doc = (
                    self.catalog._dataset_doc_ref(collection, dataset_name)
                    .collection("schemas")
                    .document(sid)
                    .get()
                )
                sdict = doc.to_dict() or None
            except Exception:
                sdict = None

        # As a last-resort when no catalog is attached, fall back to an
        # in-memory search for compatibility (offline/unit-test mode).
        if sdict is None and not self.catalog:
            for s in self.metadata.schemas or []:
                if s.get("schema_id") == sid:
                    sdict = s
                    break

        if sdict is None:
            return None

        # Try to construct an Orso RelationSchema
        from orso.schema import FlatColumn
        from orso.schema import RelationSchema

        # If metadata stored a raw schema
        raw = sdict.get("columns")

        columns = [
            FlatColumn(
                name=c.get("name"),
                type=c.get("type"),
                element_type=c.get("element-type"),
                precision=c.get("precision"),
                scale=c.get("scale"),
            )
            for c in raw
        ]
        orso_schema = RelationSchema(name=self.identifier, columns=columns)
        return orso_schema
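
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# Minimal illustration of the read-side helpers defined above. Only
# snapshot(), snapshots() and schema() come from this class; how the
# `dataset` handle is obtained (for example via the catalog used in
# scripts/read_dataset.py) is an assumption of this sketch.
current = dataset.snapshot()                 # in-memory current snapshot
if current is not None:
    print(current.snapshot_id, current.operation_type)
for s in dataset.snapshots():                # in-memory snapshot history
    print(s.sequence_number, s.commit_message)
pinned = dataset.snapshot(snapshot_id=1700000000000)  # backend lookup by id (placeholder id)
relation_schema = dataset.schema()           # Orso RelationSchema, or None
# -----------------------------------------------------------------------------
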
    def append(self, table: Any, author: str = None, commit_message: Optional[str] = None):
        """Append a pyarrow.Table:

        - write a Parquet data file via `self.io`
        - create a simple Parquet manifest (one entry)
        - persist manifest and snapshot metadata using the attached `catalog`
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        snapshot_id = int(time.time() * 1000)

        if not hasattr(table, "schema"):
            raise TypeError("append() expects a pyarrow.Table-like object")

        # Write table and build manifest entry
        manifest_entry = self._write_table_and_build_entry(table)
        entries = [manifest_entry.to_dict()]

        # persist manifest: for append, merge previous manifest entries
        # with the new entries so the snapshot's manifest is cumulative.
        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            merged_entries = list(entries)

            # If there is a previous snapshot with a manifest, try to read
            # it and prepend its entries. Any read error is non-fatal and we
            # fall back to writing only the new entries.
            prev_snap = self.snapshot(None)
            if prev_snap and getattr(prev_snap, "manifest_list", None):
                prev_manifest_path = prev_snap.manifest_list
                try:
                    # Prefer FileIO when available
                    inp = self.io.new_input(prev_manifest_path)
                    with inp.open() as f:
                        prev_data = f.read()
                    import pyarrow as pa
                    import pyarrow.parquet as pq

                    prev_table = pq.read_table(pa.BufferReader(prev_data))
                    prev_rows = prev_table.to_pylist()
                    merged_entries = prev_rows + merged_entries
                except Exception:
                    # If we can't read the previous manifest, continue with
                    # just the new entries (don't fail the append).
                    pass

            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, merged_entries, self.metadata.location
            )

        # snapshot metadata
        if author is None:
            raise ValueError("author must be provided when appending to a dataset")
        # update metadata author/timestamp for this append
        self.metadata.author = author
        self.metadata.timestamp_ms = snapshot_id
        # default commit message
        if commit_message is None:
            commit_message = f"commit by {author}"

        recs = int(table.num_rows)
        fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
        # Calculate uncompressed size from the manifest entry
        added_data_size = manifest_entry.uncompressed_size_in_bytes
        added_data_files = 1
        added_files_size = fsize
        added_records = recs
        deleted_data_files = 0
        deleted_files_size = 0
        deleted_data_size = 0
        deleted_records = 0

        prev = self.snapshot()
        if prev and prev.summary:
            prev_total_files = int(prev.summary.get("total-data-files", 0))
            prev_total_size = int(prev.summary.get("total-files-size", 0))
            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
            prev_total_records = int(prev.summary.get("total-records", 0))
        else:
            prev_total_files = 0
            prev_total_size = 0
            prev_total_data_size = 0
            prev_total_records = 0

        total_data_files = prev_total_files + added_data_files - deleted_data_files
        total_files_size = prev_total_size + added_files_size - deleted_files_size
        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
        total_records = prev_total_records + added_records - deleted_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-data-size": added_data_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-data-size": deleted_data_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-data-size": total_data_size,
            "total-records": total_records,
        }

        # sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="append",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        # persist metadata (let errors propagate)
        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
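
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# How append() above is typically driven: build a pyarrow.Table and commit it
# with an explicit author. The `dataset` handle is assumed to be a
# SimpleDataset wired with a FileIO (`io`) and a catalog, as in
# scripts/create_dataset.py.
import pyarrow as pa

rows = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
dataset.append(rows, author="engineering", commit_message="load initial rows")
print(dataset.metadata.current_snapshot_id)  # snapshot id of the new commit
# -----------------------------------------------------------------------------
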
    def _write_table_and_build_entry(self, table: Any):
        """Write a PyArrow table to storage and return a ParquetManifestEntry.

        This centralizes the IO and manifest construction so other operations
        (e.g. `overwrite`) can reuse the same behavior as `append`.
        """
        # Write parquet file with collision-resistant name
        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
        data_path = f"{self.metadata.location}/data/{fname}"

        import pyarrow as pa
        import pyarrow.parquet as pq

        buf = pa.BufferOutputStream()
        pq.write_table(table, buf, compression="zstd")
        pdata = buf.getvalue().to_pybytes()

        out = self.io.new_output(data_path).create()
        out.write(pdata)
        out.close()

        # Build manifest entry with statistics
        manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
        return manifest_entry

    def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
        """Replace the dataset entirely with `table` in a single snapshot.

        Semantics:
        - Write the provided table as new data file(s)
        - Create a new parquet manifest that contains only the new entries
        - Create a snapshot that records previous files as deleted and the
          new files as added (logical replace)
        """
        # Similar validation as append
        snapshot_id = int(time.time() * 1000)

        if not hasattr(table, "schema"):
            raise TypeError("overwrite() expects a pyarrow.Table-like object")

        if author is None:
            raise ValueError("author must be provided when overwriting a dataset")

        # Write new data and build manifest entries (single table -> single entry)
        manifest_entry = self._write_table_and_build_entry(table)
        new_entries = [manifest_entry.to_dict()]

        # Write manifest containing only the new entries
        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, new_entries, self.metadata.location
            )

        # Compute deltas: previous manifest becomes deleted
        prev = self.snapshot(None)
        prev_total_files = 0
        prev_total_size = 0
        prev_total_data_size = 0
        prev_total_records = 0
        if prev and prev.summary:
            prev_total_files = int(prev.summary.get("total-data-files", 0))
            prev_total_size = int(prev.summary.get("total-files-size", 0))
            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
            prev_total_records = int(prev.summary.get("total-records", 0))

        deleted_data_files = prev_total_files
        deleted_files_size = prev_total_size
        deleted_data_size = prev_total_data_size
        deleted_records = prev_total_records

        added_data_files = len(new_entries)
        added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
        added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
        added_records = sum(e.get("record_count", 0) for e in new_entries)

        total_data_files = added_data_files
        total_files_size = added_files_size
        total_data_size = added_data_size
        total_records = added_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-data-size": added_data_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-data-size": deleted_data_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-data-size": total_data_size,
            "total-records": total_records,
        }

        # sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        if commit_message is None:
            commit_message = f"overwrite by {author}"

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="overwrite",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        # Replace in-memory snapshots
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
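
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# overwrite() above is a logical replace: the previous files are reported as
# deleted in the snapshot summary and only the new table remains visible.
# Sketch only; the `dataset` handle is an assumption.
import pyarrow as pa

replacement = pa.table({"id": [10, 11], "name": ["x", "y"]})
dataset.overwrite(replacement, author="engineering", commit_message="rebuild table")
snap = dataset.snapshot()
print(snap.summary["deleted-data-files"], snap.summary["total-records"])
# -----------------------------------------------------------------------------
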
    def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
        """Add filenames to the dataset manifest without writing the files.

        - `files` is a list of file paths (strings). Files are assumed to
          already exist in storage; this method only updates the manifest.
        - Does not add files that already appear in the current manifest
          (deduplicates by `file_path`).
        - Creates a cumulative manifest for the new snapshot (previous
          entries + new unique entries).
        """
        if author is None:
            raise ValueError("author must be provided when adding files to a dataset")

        snapshot_id = int(time.time() * 1000)

        # Gather previous summary and manifest entries
        prev = self.snapshot(None)
        prev_total_files = 0
        prev_total_size = 0
        prev_total_records = 0
        prev_entries = []
        if prev and prev.summary:
            prev_total_files = int(prev.summary.get("total-data-files", 0))
            prev_total_size = int(prev.summary.get("total-files-size", 0))
            prev_total_records = int(prev.summary.get("total-records", 0))
        if prev and getattr(prev, "manifest_list", None):
            # try to read prev manifest entries
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq

                inp = self.io.new_input(prev.manifest_list)
                with inp.open() as f:
                    data = f.read()
                table = pq.read_table(pa.BufferReader(data))
                prev_entries = table.to_pylist()
            except Exception:
                prev_entries = []

        existing = {
            e.get("file_path") for e in prev_entries if isinstance(e, dict) and e.get("file_path")
        }

        # Build new entries for files that don't already exist. Only accept
        # Parquet files and attempt to read lightweight metadata (bytes,
        # row count, per-column min/max) from the Parquet footer when
        # available.
        new_entries = []
        seen = set()
        for fp in files:
            if not fp or fp in existing or fp in seen:
                continue
            if not fp.lower().endswith(".parquet"):
                # only accept parquet files
                continue
            seen.add(fp)

            # Attempt to read file bytes and parquet metadata
            # Use rugo's metadata reader which is much faster (microseconds per file)
            try:
                inp = self.io.new_input(fp)
                with inp.open() as f:
                    data = f.read()

                if data:
                    manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
                else:
                    # Empty file, create placeholder entry
                    manifest_entry = ParquetManifestEntry(
                        file_path=fp,
                        file_format="parquet",
                        record_count=0,
                        null_counts=[],
                        file_size_in_bytes=0,
                        uncompressed_size_in_bytes=0,
                        column_uncompressed_sizes_in_bytes=[],
                        min_k_hashes=[],
                        histogram_counts=[],
                        histogram_bins=0,
                        min_values=[],
                        max_values=[],
                    )
            except Exception:
                # If metadata read fails, fall back to placeholders
                manifest_entry = ParquetManifestEntry(
                    file_path=fp,
                    file_format="parquet",
                    record_count=0,
                    null_counts=[],
                    file_size_in_bytes=0,
                    uncompressed_size_in_bytes=0,
                    column_uncompressed_sizes_in_bytes=[],
                    min_k_hashes=[],
                    histogram_counts=[],
                    histogram_bins=0,
                    min_values=[],
                    max_values=[],
                )
            new_entries.append(manifest_entry.to_dict())

        merged_entries = prev_entries + new_entries

        # write cumulative manifest
        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, merged_entries, self.metadata.location
            )

        # Build summary deltas
        added_data_files = len(new_entries)
        added_files_size = 0
        added_data_size = 0
        added_records = 0
        # Sum uncompressed sizes from new entries
        for entry in new_entries:
            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
        deleted_data_files = 0
        deleted_files_size = 0
        deleted_data_size = 0
        deleted_records = 0

        prev_total_data_size = (
            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
        )

        total_data_files = prev_total_files + added_data_files - deleted_data_files
        total_files_size = prev_total_size + added_files_size - deleted_files_size
        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
        total_records = prev_total_records + added_records - deleted_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-data-size": added_data_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-data-size": deleted_data_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-data-size": total_data_size,
            "total-records": total_records,
        }

        # Sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        if commit_message is None:
            commit_message = f"add files by {author}"

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="add-files",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
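
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# add_files() above registers Parquet files that already exist in storage;
# nothing is copied or rewritten, and duplicate or non-.parquet paths are
# skipped. The bucket and paths below are placeholders.
dataset.add_files(
    [
        "gs://example-bucket/landing/2024-01-01.parquet",
        "gs://example-bucket/landing/2024-01-02.parquet",
    ],
    author="ingest-job",
    commit_message="register externally written files",
)
# -----------------------------------------------------------------------------
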
    def truncate_and_add_files(
        self, files: list[str], author: str = None, commit_message: Optional[str] = None
    ):
        """Truncate dataset (logical) and set manifest to provided files.

        - Writes a manifest that contains exactly the unique filenames provided.
        - Does not delete objects from storage.
        - Useful for replace/overwrite semantics.
        """
        if author is None:
            raise ValueError("author must be provided when truncating/adding files")

        snapshot_id = int(time.time() * 1000)

        # Read previous summary for reporting deleted counts
        prev = self.snapshot(None)
        prev_total_files = 0
        prev_total_size = 0
        prev_total_records = 0
        if prev and prev.summary:
            try:
                prev_total_files = int(prev.summary.get("total-data-files", 0))
            except Exception:
                prev_total_files = 0
            try:
                prev_total_size = int(prev.summary.get("total-files-size", 0))
            except Exception:
                prev_total_size = 0
            try:
                prev_total_records = int(prev.summary.get("total-records", 0))
            except Exception:
                prev_total_records = 0

        # Build unique new entries (ignore duplicates in input). Only accept
        # parquet files and try to read lightweight metadata from each file.
        new_entries = []
        seen = set()
        for fp in files:
            if not fp or fp in seen:
                continue
            if not fp.lower().endswith(".parquet"):
                continue
            seen.add(fp)

            file_size = 0
            record_count = 0
            min_values = []
            max_values = []
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq

                data = None
                if self.io and hasattr(self.io, "new_input"):
                    inp = self.io.new_input(fp)
                    with inp.open() as f:
                        data = f.read()
                else:
                    if (
                        self.catalog
                        and getattr(self.catalog, "_storage_client", None)
                        and getattr(self.catalog, "gcs_bucket", None)
                    ):
                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
                        parsed = fp
                        if parsed.startswith("gs://"):
                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
                        blob = bucket.blob(parsed)
                        data = blob.download_as_bytes()

                if data:
                    file_size = len(data)
                    pf = pq.ParquetFile(pa.BufferReader(data))
                    record_count = int(pf.metadata.num_rows or 0)

                    ncols = pf.metadata.num_columns
                    mins = [None] * ncols
                    maxs = [None] * ncols
                    null_counts = [0] * ncols
                    for rg in range(pf.num_row_groups):
                        for ci in range(ncols):
                            col_meta = pf.metadata.row_group(rg).column(ci)
                            stats = getattr(col_meta, "statistics", None)
                            if not stats:
                                continue
                            smin = getattr(stats, "min", None)
                            smax = getattr(stats, "max", None)
                            snull_count = getattr(stats, "null_count", None)
                            if smin is None and smax is None and snull_count is None:
                                continue

                            def _to_py(v):
                                try:
                                    return int(v)
                                except Exception:
                                    try:
                                        return float(v)
                                    except Exception:
                                        try:
                                            if isinstance(v, (bytes, bytearray)):
                                                return v.decode("utf-8", errors="ignore")
                                        except Exception:
                                            pass
                                        return v

                            if smin is not None:
                                sval = _to_py(smin)
                                if mins[ci] is None:
                                    mins[ci] = sval
                                else:
                                    try:
                                        if sval < mins[ci]:
                                            mins[ci] = sval
                                    except Exception:
                                        pass
                            if smax is not None:
                                sval = _to_py(smax)
                                if maxs[ci] is None:
                                    maxs[ci] = sval
                                else:
                                    try:
                                        if sval > maxs[ci]:
                                            maxs[ci] = sval
                                    except Exception:
                                        pass
                            if snull_count is not None:
                                try:
                                    null_counts[ci] += int(snull_count)
                                except Exception:
                                    pass

                    min_values = [m for m in mins if m is not None]
                    max_values = [m for m in maxs if m is not None]
            except Exception:
                file_size = 0
                record_count = 0
                min_values = []
                max_values = []
                null_counts = []

            manifest_entry = ParquetManifestEntry(
                file_path=fp,
                file_format="parquet",
                record_count=int(record_count),
                null_counts=null_counts,
                file_size_in_bytes=int(file_size),
                uncompressed_size_in_bytes=int(file_size),  # Use compressed size as estimate
                column_uncompressed_sizes_in_bytes=[],
                min_k_hashes=[],
                histogram_counts=[],
                histogram_bins=0,
                min_values=min_values,
                max_values=max_values,
            )
            new_entries.append(manifest_entry.to_dict())

        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, new_entries, self.metadata.location
            )

        # Build summary: previous entries become deleted
        deleted_data_files = prev_total_files
        deleted_files_size = prev_total_size
        deleted_data_size = (
            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
        )
        deleted_records = prev_total_records

        added_data_files = len(new_entries)
        added_files_size = 0
        added_data_size = 0
        # Sum uncompressed sizes from new entries
        for entry in new_entries:
            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
        added_records = 0

        total_data_files = added_data_files
        total_files_size = added_files_size
        total_data_size = added_data_size
        total_records = added_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-data-size": added_data_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-data-size": deleted_data_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-data-size": total_data_size,
            "total-records": total_records,
        }

        # Sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        if commit_message is None:
            commit_message = f"truncate and add files by {author}"

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="truncate-and-add-files",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        # Replace in-memory snapshots: append snapshot and update current id
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)

    def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
        """Return Datafile objects for the given snapshot.

        - If `snapshot_id` is None, use the current snapshot.
        """
        # Determine snapshot to read using the dataset-level helper which
        # prefers the in-memory current snapshot and otherwise performs a
        # backend lookup for the requested id.
        snap = self.snapshot(snapshot_id)

        if snap is None or not getattr(snap, "manifest_list", None):
            return iter(())

        manifest_path = snap.manifest_list

        # Read manifest via FileIO if available
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq

            inp = self.io.new_input(manifest_path)
            with inp.open() as f:
                data = f.read()

            if not data:
                return iter(())

            table = pq.read_table(pa.BufferReader(data))
            rows = table.to_pylist()
            for r in rows:
                yield Datafile(entry=r)
        except FileNotFoundError:
            return iter(())
        except Exception:
            return iter(())
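
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# scan() above yields Datafile wrappers over the manifest of a snapshot, which
# a reader can use for pruning before fetching any data. Sketch only; the
# snapshot id shown is a placeholder.
for datafile in dataset.scan():              # current snapshot
    print(datafile.file_path, datafile.record_count, datafile.file_size_in_bytes)
    print(datafile.min_values, datafile.max_values)

for datafile in dataset.scan(snapshot_id=1700000000000):  # time travel
    print(datafile.file_path)
# -----------------------------------------------------------------------------
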
    def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
        """Refresh manifest statistics and create a new snapshot.

        - `agent`: identifier for the agent performing the refresh (string)
        - `author`: optional author to record; if omitted uses current snapshot author

        This recalculates per-file statistics (min/max, record counts, sizes)
        for every file in the current manifest, writes a new manifest and
        creates a new snapshot with `user_created=False` and
        `operation_type='statistics-refresh'`.

        Returns the new `snapshot_id` on success or None on failure.
        """
        prev = self.snapshot(None)
        if prev is None or not getattr(prev, "manifest_list", None):
            raise ValueError("No current manifest available to refresh")

        # Use same author/commit-timestamp as previous snapshot unless overridden
        use_author = author if author is not None else getattr(prev, "author", None)

        snapshot_id = int(time.time() * 1000)

        # Rebuild manifest entries by re-reading each data file
        entries = []
        try:
            # Read previous manifest entries
            inp = self.io.new_input(prev.manifest_list)
            with inp.open() as f:
                prev_data = f.read()
            import pyarrow as pa
            import pyarrow.parquet as pq

            prev_table = pq.read_table(pa.BufferReader(prev_data))
            prev_rows = prev_table.to_pylist()
        except Exception:
            prev_rows = []

        total_files = 0
        total_size = 0
        total_data_size = 0
        total_records = 0

        for ent in prev_rows:
            if not isinstance(ent, dict):
                continue
            fp = ent.get("file_path")
            if not fp:
                continue
            try:
                inp = self.io.new_input(fp)
                with inp.open() as f:
                    data = f.read()
                # Full statistics including histograms and k-hashes
                table = pq.read_table(pa.BufferReader(data))
                manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
                dent = manifest_entry.to_dict()
            except Exception:
                # Fall back to original entry if re-read fails
                dent = ent

            entries.append(dent)
            total_files += 1
            total_size += int(dent.get("file_size_in_bytes") or 0)
            total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
            total_records += int(dent.get("record_count") or 0)

        # write new manifest
        manifest_path = self.catalog.write_parquet_manifest(
            snapshot_id, entries, self.metadata.location
        )

        # Build summary
        summary = {
            "added-data-files": 0,
            "added-files-size": 0,
            "added-data-size": 0,
            "added-records": 0,
            "deleted-data-files": 0,
            "deleted-files-size": 0,
            "deleted-data-size": 0,
            "deleted-records": 0,
            "total-data-files": total_files,
            "total-files-size": total_size,
            "total-data-size": total_data_size,
            "total-records": total_records,
        }

        # sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        # Agent committer metadata
        agent_meta = {
            "timestamp": int(time.time() * 1000),
            "action": "statistics-refresh",
            "agent": agent,
        }

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
            author=use_author,
            sequence_number=next_seq,
            user_created=False,
            operation_type="statistics-refresh",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=getattr(prev, "commit_message", "statistics refresh"),
            summary=summary,
        )

        # attach agent metadata under summary
        if snap.summary is None:
            snap.summary = {}
        snap.summary["agent-committer"] = agent_meta

        # update in-memory metadata
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        # persist
        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)

        return snapshot_id
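
# --- Editorial usage sketch (not part of dataset.py) -------------------------
# refresh_manifest() above is aimed at background maintenance agents: it
# re-reads every data file, rebuilds statistics and commits a snapshot with
# user_created=False. Sketch only; the agent name is a placeholder.
new_id = dataset.refresh_manifest(agent="stats-refresher-01")
if new_id is not None:
    refreshed = dataset.snapshot(new_id)
    print(refreshed.operation_type)          # "statistics-refresh"
    print(refreshed.summary.get("agent-committer"))
# -----------------------------------------------------------------------------
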
    def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
        """Delete all data files and manifests for this dataset.

        This attempts to delete every data file referenced by existing
        Parquet manifests and then delete the manifest files themselves.
        Finally it clears the in-memory snapshot list and persists the
        empty snapshot set via the attached `catalog` (if available).
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        io = self.io
        # Collect files referenced by existing manifests but do NOT delete
        # them from storage. Instead we will write a new empty manifest and
        # create a truncate snapshot that records these files as deleted.
        snaps = list(self.metadata.snapshots)
        removed_files = []
        removed_total_size = 0
        removed_data_size = 0

        for snap in snaps:
            manifest_path = getattr(snap, "manifest_list", None)
            if not manifest_path:
                continue

            # Read manifest via FileIO if available
            rows = []
            try:
                inp = io.new_input(manifest_path)
                with inp.open() as f:
                    data = f.read()
                table = pq.read_table(pa.BufferReader(data))
                rows = table.to_pylist()
            except Exception:
                rows = []

            for r in rows:
                fp = None
                fsize = 0
                data_size = 0
                if isinstance(r, dict):
                    fp = r.get("file_path")
                    fsize = int(r.get("file_size_in_bytes") or 0)
                    data_size = int(r.get("uncompressed_size_in_bytes") or 0)
                    if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                        fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                        fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
                        data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)

                if fp:
                    removed_files.append(fp)
                    removed_total_size += fsize
                    removed_data_size += data_size

        # Create a new empty Parquet manifest (entries=[]) to represent the
        # truncated dataset for the new snapshot. Do not delete objects.
        snapshot_id = int(time.time() * 1000)

        # Do NOT write an empty Parquet manifest when there are no entries.
        # Per policy, create the snapshot without a manifest so older
        # snapshots remain readable and we avoid creating empty manifest files.
        manifest_path = None

        # Build summary reflecting deleted files (tracked, not removed)
        deleted_count = len(removed_files)
        deleted_size = removed_total_size

        summary = {
            "added-data-files": 0,
            "added-files-size": 0,
            "added-data-size": 0,
            "added-records": 0,
            "deleted-data-files": deleted_count,
            "deleted-files-size": deleted_size,
            "deleted-data-size": removed_data_size,
            "deleted-records": 0,
            "total-data-files": 0,
            "total-files-size": 0,
            "total-data-size": 0,
            "total-records": 0,
        }

        # Sequence number
        try:
            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1

        if author is None:
            raise ValueError(
                "truncate() must be called with an explicit author; use truncate(author=...) in caller"
            )
        # update metadata author/timestamp for this truncate
        self.metadata.author = author
        self.metadata.timestamp_ms = snapshot_id
        # default commit message
        if commit_message is None:
            commit_message = f"commit by {author}"

        parent_id = self.metadata.current_snapshot_id

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="truncate",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        # Append new snapshot and update current snapshot id
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)