opteryx-catalog 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +3 -0
- opteryx_catalog/catalog/dataset.py +1221 -0
- opteryx_catalog/catalog/manifest.py +23 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +225 -0
- opteryx_catalog/opteryx_catalog.py +923 -0
- opteryx_catalog-0.4.4.dist-info/METADATA +464 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +23 -0
- opteryx_catalog-0.4.4.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.4.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.4.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_dataset_metadata.py +15 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0
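
Before the full diff of opteryx_catalog/catalog/dataset.py below, here is a minimal, hedged usage sketch of the SimpleDataset API that the file defines. How a SimpleDataset instance is obtained (normally through the catalog implemented in opteryx_catalog.py) is not part of this file, so `dataset` is assumed to be an already-constructed, catalog-attached instance, and the gs:// path is purely illustrative.

import pyarrow as pa

# `dataset` is assumed to be a catalog-attached SimpleDataset obtained elsewhere.
table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# append() writes a zstd-compressed Parquet file, computes per-column
# sketches, and commits a new snapshot; an author is mandatory.
dataset.append(table, author="data-eng", commit_message="initial load")

# add_files() registers already-written Parquet files without copying bytes.
dataset.add_files(["gs://example-bucket/path/file.parquet"], author="data-eng")

# scan() yields Datafile wrappers from the current snapshot's manifest.
for datafile in dataset.scan(row_limit=1_000):
    print(datafile.file_path, datafile.record_count, datafile.file_size_in_bytes)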
opteryx_catalog/catalog/dataset.py
@@ -0,0 +1,1221 @@
from __future__ import annotations

import os
import time
import uuid
from dataclasses import dataclass
from typing import Any
from typing import Iterable
from typing import Optional

from .metadata import DatasetMetadata
from .metadata import Snapshot
from .metastore import Dataset

# Stable node identifier for this process (hex-mac-hex-pid)
_NODE = f"{uuid.getnode():x}-{os.getpid():x}"


@dataclass
class Datafile:
    """Wrapper for a manifest entry representing a data file."""

    entry: dict

    @property
    def file_path(self) -> Optional[str]:
        return self.entry.get("file_path")

    @property
    def record_count(self) -> int:
        return int(self.entry.get("record_count") or 0)

    @property
    def file_size_in_bytes(self) -> int:
        return int(self.entry.get("file_size_in_bytes") or 0)

    def to_dict(self) -> dict:
        return dict(self.entry)

    @property
    def min_k_hashes(self) -> list:
        return self.entry.get("min_k_hashes") or []

    @property
    def histogram_counts(self) -> list:
        return self.entry.get("histogram_counts") or []

    @property
    def histogram_bins(self) -> int:
        return int(self.entry.get("histogram_bins") or 0)

    @property
    def min_values(self) -> list:
        return self.entry.get("min_values") or []

    @property
    def max_values(self) -> list:
        return self.entry.get("max_values") or []


@dataclass
class SimpleDataset(Dataset):
    identifier: str
    _metadata: DatasetMetadata
    io: Any = None
    catalog: Any = None

    @property
    def metadata(self) -> DatasetMetadata:
        return self._metadata

    def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
        """Return a Snapshot.

        - If `snapshot_id` is None, return the in-memory current snapshot.
        - If a `snapshot_id` is provided, prefer a Firestore lookup via the
          attached `catalog` (O(1) document get). Fall back to the in-memory
          `metadata.snapshots` list only when no catalog is attached or the
          remote lookup fails.
        """
        # Current snapshot: keep in memory for fast access
        if snapshot_id is None:
            return self.metadata.current_snapshot()

        # Try Firestore document lookup when catalog attached
        if self.catalog:
            try:
                collection, dataset_name = self.identifier.split(".")
                doc = (
                    self.catalog._dataset_doc_ref(collection, dataset_name)
                    .collection("snapshots")
                    .document(str(snapshot_id))
                    .get()
                )
                if doc.exists:
                    sd = doc.to_dict() or {}
                    snap = Snapshot(
                        snapshot_id=int(
                            sd.get("snapshot-id") or sd.get("snapshot_id") or snapshot_id
                        ),
                        timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
                        author=sd.get("author"),
                        sequence_number=sd.get("sequence-number") or sd.get("sequence_number"),
                        user_created=sd.get("user-created") or sd.get("user_created"),
                        manifest_list=sd.get("manifest") or sd.get("manifest_list"),
                        schema_id=sd.get("schema-id") or sd.get("schema_id"),
                        summary=sd.get("summary", {}),
                        operation_type=sd.get("operation-type") or sd.get("operation_type"),
                        parent_snapshot_id=sd.get("parent-snapshot-id")
                        or sd.get("parent_snapshot_id"),
                        commit_message=sd.get("commit-message") or sd.get("commit_message"),
                    )
                    return snap
            except Exception:
                # Be conservative: fall through to in-memory fallback
                pass

        # Fallback: search in-memory snapshots (only used when no catalog)
        for s in self.metadata.snapshots:
            if s.snapshot_id == snapshot_id:
                return s

        return None

    def _get_node(self) -> str:
        """Return the stable node identifier for this process.

        Uses a module-level constant to avoid per-instance hashing/caching.
        """
        return _NODE

    def snapshots(self) -> Iterable[Snapshot]:
        return list(self.metadata.snapshots)

    def schema(self, schema_id: Optional[str] = None) -> Optional[dict]:
        """Return a stored schema description.

        If `schema_id` is None, return the current schema (by
        `metadata.current_schema_id` or last-known schema). If a
        specific `schema_id` is provided, attempt to find it in the
        in-memory `metadata.schemas` list and, failing that, fetch it
        from the catalog's `schemas` subcollection when a catalog is
        attached.

        Returns the stored schema dict (contains keys like `schema_id`,
        `columns`, `timestamp-ms`, etc.) or None if not found.
        """
        # Determine which schema id to use
        sid = schema_id or self.metadata.current_schema_id

        # If no sid and a raw schema is stored on the metadata, return it
        if sid is None:
            return getattr(self.metadata, "schema", None)

        # Fast path: if this is the current schema id, prefer the cached
        # current schema (99% case) rather than scanning the entire list.
        sdict = None
        if sid == self.metadata.current_schema_id:
            if getattr(self.metadata, "schemas", None):
                last = self.metadata.schemas[-1]
                if last.get("schema_id") == sid:
                    sdict = last
            else:
                # If a raw schema is stored directly on metadata, use it.
                raw = getattr(self.metadata, "schema", None)
                if raw is not None:
                    sdict = {"schema_id": sid, "columns": raw}

        # If not the current schema, or cached current not present,
        # prefer to load the schema document from the backend (O(1) doc get).
        if sdict is None and self.catalog:
            try:
                collection, dataset_name = self.identifier.split(".")
                doc = (
                    self.catalog._dataset_doc_ref(collection, dataset_name)
                    .collection("schemas")
                    .document(sid)
                    .get()
                )
                sdict = doc.to_dict() or None
            except Exception:
                sdict = None

        # As a last-resort when no catalog is attached, fall back to an
        # in-memory search for compatibility (offline/unit-test mode).
        if sdict is None and not self.catalog:
            for s in self.metadata.schemas or []:
                if s.get("schema_id") == sid:
                    sdict = s
                    break

        if sdict is None:
            return None

        # Try to construct an Orso RelationSchema
        from orso.schema import FlatColumn
        from orso.schema import RelationSchema

        # If metadata stored a raw schema
        raw = sdict.get("columns")

        columns = [
            FlatColumn(
                name=c.get("name"),
                type=c.get("type"),
                element_type=c.get("element-type"),
                precision=c.get("precision"),
                scale=c.get("scale"),
            )
            for c in raw
        ]
        orso_schema = RelationSchema(name=self.identifier, columns=columns)
        return orso_schema

    def append(self, table: Any, author: str = None, commit_message: Optional[str] = None):
        """Append a pyarrow.Table:

        - write a Parquet data file via `self.io`
        - create a simple Parquet manifest (one entry)
        - persist manifest and snapshot metadata using the attached `catalog`
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        snapshot_id = int(time.time() * 1000)

        if not hasattr(table, "schema"):
            raise TypeError("append() expects a pyarrow.Table-like object")

        # Write parquet file with collision-resistant name
        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
        data_path = f"{self.metadata.location}/data/{fname}"
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf, compression="zstd")
        pdata = buf.getvalue().to_pybytes()

        out = self.io.new_output(data_path).create()
        out.write(pdata)
        out.close()

        # Prepare sketches/stats
        K = 32
        HBINS = 32
        min_k_hashes: list[list[int]] = []
        histograms: list[list[int]] = []
        min_values: list[int] = []
        max_values: list[int] = []

        # Use draken for efficient hashing and compression when available.
        import heapq

        # canonical NULL flag for missing values
        NULL_FLAG = -(1 << 63)

        try:
            import opteryx.draken as draken  # type: ignore

            num_rows = int(table.num_rows)

            for col_idx, col in enumerate(table.columns):
                # hash column values to 64-bit via draken (new cpdef API)
                vec = draken.Vector.from_arrow(col)
                hashes = list(vec.hash())

                # Decide whether to compute min-k/histogram for this column based
                # on field type and, for strings, average length of values.
                field_type = table.schema.field(col_idx).type
                compute_min_k = False
                if (
                    pa.types.is_integer(field_type)
                    or pa.types.is_floating(field_type)
                    or pa.types.is_decimal(field_type)
                ):
                    compute_min_k = True
                elif (
                    pa.types.is_timestamp(field_type)
                    or pa.types.is_date(field_type)
                    or pa.types.is_time(field_type)
                ):
                    compute_min_k = True
                elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
                    # compute average length from non-null values; only allow
                    # min-k/histogram for short strings (avg <= 16)
                    col_py = None
                    try:
                        col_py = col.to_pylist()
                    except Exception:
                        col_py = None

                    if col_py is not None:
                        lens = [len(x) for x in col_py if x is not None]
                        if lens:
                            avg_len = sum(lens) / len(lens)
                            if avg_len <= 16:
                                compute_min_k = True

                # KMV: take K smallest hashes when allowed; otherwise store an
                # empty list for this column.
                if compute_min_k:
                    smallest = heapq.nsmallest(K, hashes)
                    col_min_k = sorted(smallest)
                else:
                    col_min_k = []

                # For histogram decisions follow the same rule as min-k
                compute_hist = compute_min_k

                # Use draken.compress() to get canonical int64 per value
                mapped = list(vec.compress())
                non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
                if non_nulls_mapped:
                    vmin = min(non_nulls_mapped)
                    vmax = max(non_nulls_mapped)
                    col_min = int(vmin)
                    col_max = int(vmax)
                    if compute_hist:
                        if vmin == vmax:
                            col_hist = [0] * HBINS
                            col_hist[-1] = len(non_nulls_mapped)
                        else:
                            col_hist = [0] * HBINS
                            span = float(vmax - vmin)
                            for m in non_nulls_mapped:
                                b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
                                if b < 0:
                                    b = 0
                                if b >= HBINS:
                                    b = HBINS - 1
                                col_hist[b] += 1
                    else:
                        col_hist = [0] * HBINS
                else:
                    # no non-null values; histogram via hash buckets
                    col_min = NULL_FLAG
                    col_max = NULL_FLAG
                    if compute_hist:
                        col_hist = [0] * HBINS
                        for h in hashes:
                            b = (h >> (64 - 5)) & 0x1F
                            col_hist[b] += 1
                    else:
                        col_hist = [0] * HBINS

                min_k_hashes.append(col_min_k)
                histograms.append(col_hist)
                min_values.append(col_min)
                max_values.append(col_max)
        except Exception:
            # If draken or its dependencies are unavailable, fall back to
            # conservative defaults so we can still write the manifest and
            # snapshot without failing the append operation.
            num_cols = table.num_columns
            min_k_hashes = [[] for _ in range(num_cols)]
            HBINS = 32
            histograms = [[0] * HBINS for _ in range(num_cols)]
            min_values = [NULL_FLAG] * num_cols
            max_values = [NULL_FLAG] * num_cols

        entries = [
            {
                "file_path": data_path,
                "file_format": "parquet",
                "record_count": int(table.num_rows),
                "file_size_in_bytes": len(pdata),
                "min_k_hashes": min_k_hashes,
                "histogram_counts": histograms,
                "histogram_bins": HBINS,
                "min_values": min_values,
                "max_values": max_values,
            }
        ]

        # persist manifest: for append, merge previous manifest entries
        # with the new entries so the snapshot's manifest is cumulative.
        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            merged_entries = list(entries)

            # If there is a previous snapshot with a manifest, try to read
            # it and prepend its entries. Any read error is non-fatal and we
            # fall back to writing only the new entries.
            prev_snap = self.snapshot(None)
            if prev_snap and getattr(prev_snap, "manifest_list", None):
                prev_manifest_path = prev_snap.manifest_list
                try:
                    # Prefer FileIO when available
                    if self.io and hasattr(self.io, "new_input"):
                        inp = self.io.new_input(prev_manifest_path)
                        with inp.open() as f:
                            prev_data = f.read()
                        import pyarrow as pa
                        import pyarrow.parquet as pq

                        prev_table = pq.read_table(pa.BufferReader(prev_data))
                        prev_rows = prev_table.to_pylist()
                        merged_entries = prev_rows + merged_entries
                    else:
                        # Fall back to catalog storage client (GCS)
                        if (
                            self.catalog
                            and getattr(self.catalog, "_storage_client", None)
                            and getattr(self.catalog, "gcs_bucket", None)
                        ):
                            bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
                            parsed = prev_manifest_path
                            if parsed.startswith("gs://"):
                                parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
                            blob = bucket.blob(parsed)
                            prev_data = blob.download_as_bytes()
                            import pyarrow as pa
                            import pyarrow.parquet as pq

                            prev_table = pq.read_table(pa.BufferReader(prev_data))
                            prev_rows = prev_table.to_pylist()
                            merged_entries = prev_rows + merged_entries
                except Exception:
                    # If we can't read the previous manifest, continue with
                    # just the new entries (don't fail the append).
                    pass

            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, merged_entries, self.metadata.location
            )

        # snapshot metadata
        if author is None:
            raise ValueError("author must be provided when appending to a dataset")
        # update metadata author/timestamp for this append
        self.metadata.author = author
        self.metadata.timestamp_ms = snapshot_id
        # default commit message
        if commit_message is None:
            commit_message = f"commit by {author}"

        recs = int(table.num_rows)
        fsize = len(pdata)
        added_data_files = 1
        added_files_size = fsize
        added_records = recs
        deleted_data_files = 0
        deleted_files_size = 0
        deleted_records = 0

        prev = self.snapshot()
        if prev and prev.summary:
            try:
                prev_total_files = int(prev.summary.get("total-data-files", 0))
            except Exception:
                prev_total_files = 0
            try:
                prev_total_size = int(prev.summary.get("total-files-size", 0))
            except Exception:
                prev_total_size = 0
            try:
                prev_total_records = int(prev.summary.get("total-records", 0))
            except Exception:
                prev_total_records = 0
        else:
            prev_total_files = 0
            prev_total_size = 0
            prev_total_records = 0

        total_data_files = prev_total_files + added_data_files - deleted_data_files
        total_files_size = prev_total_size + added_files_size - deleted_files_size
        total_records = prev_total_records + added_records - deleted_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-records": total_records,
        }

        # sequence number
        try:
            max_seq = 0
            for s in self.metadata.snapshots:
                seq = getattr(s, "sequence_number", None)
                if seq is None:
                    continue
                try:
                    ival = int(seq)
                except Exception:
                    continue
                if ival > max_seq:
                    max_seq = ival
            next_seq = max_seq + 1
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="append",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        # persist metadata (let errors propagate)
        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)

    def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
        """Add filenames to the dataset manifest without writing the files.

        - `files` is a list of file paths (strings). Files are assumed to
          already exist in storage; this method only updates the manifest.
        - Does not add files that already appear in the current manifest
          (deduplicates by `file_path`).
        - Creates a cumulative manifest for the new snapshot (previous
          entries + new unique entries).
        """
        if author is None:
            raise ValueError("author must be provided when adding files to a dataset")

        snapshot_id = int(time.time() * 1000)

        # Gather previous summary and manifest entries
        prev = self.snapshot(None)
        prev_total_files = 0
        prev_total_size = 0
        prev_total_records = 0
        prev_entries = []
        if prev and prev.summary:
            try:
                prev_total_files = int(prev.summary.get("total-data-files", 0))
            except Exception:
                prev_total_files = 0
            try:
                prev_total_size = int(prev.summary.get("total-files-size", 0))
            except Exception:
                prev_total_size = 0
            try:
                prev_total_records = int(prev.summary.get("total-records", 0))
            except Exception:
                prev_total_records = 0

        if prev and getattr(prev, "manifest_list", None):
            # try to read prev manifest entries
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq

                if self.io and hasattr(self.io, "new_input"):
                    inp = self.io.new_input(prev.manifest_list)
                    with inp.open() as f:
                        data = f.read()
                    table = pq.read_table(pa.BufferReader(data))
                    prev_entries = table.to_pylist()
                else:
                    if (
                        self.catalog
                        and getattr(self.catalog, "_storage_client", None)
                        and getattr(self.catalog, "gcs_bucket", None)
                    ):
                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
                        parsed = prev.manifest_list
                        if parsed.startswith("gs://"):
                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
                        blob = bucket.blob(parsed)
                        data = blob.download_as_bytes()
                        table = pq.read_table(pa.BufferReader(data))
                        prev_entries = table.to_pylist()
            except Exception:
                prev_entries = []

        existing = {
            e.get("file_path") for e in prev_entries if isinstance(e, dict) and e.get("file_path")
        }

        # Build new entries for files that don't already exist. Only accept
        # Parquet files and attempt to read lightweight metadata (bytes,
        # row count, per-column min/max) from the Parquet footer when
        # available.
        new_entries = []
        seen = set()
        for fp in files:
            if not fp or fp in existing or fp in seen:
                continue
            if not fp.lower().endswith(".parquet"):
                # only accept parquet files
                continue
            seen.add(fp)

            # Attempt to read file bytes and parquet metadata
            file_size = 0
            record_count = 0
            min_values = []
            max_values = []
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq

                data = None
                if self.io and hasattr(self.io, "new_input"):
                    inp = self.io.new_input(fp)
                    with inp.open() as f:
                        data = f.read()
                else:
                    if (
                        self.catalog
                        and getattr(self.catalog, "_storage_client", None)
                        and getattr(self.catalog, "gcs_bucket", None)
                    ):
                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
                        parsed = fp
                        if parsed.startswith("gs://"):
                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
                        blob = bucket.blob(parsed)
                        data = blob.download_as_bytes()

                if data:
                    file_size = len(data)
                    pf = pq.ParquetFile(pa.BufferReader(data))
                    record_count = int(pf.metadata.num_rows or 0)

                    # Prefer computing min/max via draken.compress() over
                    # relying on Parquet footer stats which may contain
                    # heterogenous or non-numeric values. Fall back to
                    # footer stats only if draken is unavailable.
                    try:
                        import opteryx.draken as draken  # type: ignore

                        table = pq.read_table(pa.BufferReader(data))
                        ncols = table.num_columns
                        mins = [None] * ncols
                        maxs = [None] * ncols

                        NULL_FLAG = -(1 << 63)

                        for ci in range(ncols):
                            try:
                                col = table.column(ci)
                                # combine chunks if needed
                                if hasattr(col, "combine_chunks"):
                                    arr = col.combine_chunks()
                                else:
                                    arr = col
                                vec = draken.Vector.from_arrow(arr)
                                mapped = list(vec.compress())
                                non_nulls = [m for m in mapped if m != NULL_FLAG]
                                if non_nulls:
                                    mins[ci] = int(min(non_nulls))
                                    maxs[ci] = int(max(non_nulls))
                                else:
                                    mins[ci] = None
                                    maxs[ci] = None
                            except Exception:
                                # per-column fallback: leave None
                                mins[ci] = None
                                maxs[ci] = None
                    except Exception:
                        # Draken not available; fall back to Parquet footer stats
                        ncols = pf.metadata.num_columns
                        mins = [None] * ncols
                        maxs = [None] * ncols
                        for rg in range(pf.num_row_groups):
                            for ci in range(ncols):
                                col_meta = pf.metadata.row_group(rg).column(ci)
                                stats = getattr(col_meta, "statistics", None)
                                if not stats:
                                    continue
                                smin = getattr(stats, "min", None)
                                smax = getattr(stats, "max", None)
                                if smin is None and smax is None:
                                    continue

                                def _to_py(v):
                                    try:
                                        return int(v)
                                    except Exception:
                                        try:
                                            return float(v)
                                        except Exception:
                                            try:
                                                if isinstance(v, (bytes, bytearray)):
                                                    return v.decode("utf-8", errors="ignore")
                                            except Exception:
                                                pass
                                            return v

                                if smin is not None:
                                    sval = _to_py(smin)
                                    if mins[ci] is None:
                                        mins[ci] = sval
                                    else:
                                        try:
                                            if sval < mins[ci]:
                                                mins[ci] = sval
                                        except Exception:
                                            pass
                                if smax is not None:
                                    sval = _to_py(smax)
                                    if maxs[ci] is None:
                                        maxs[ci] = sval
                                    else:
                                        try:
                                            if sval > maxs[ci]:
                                                maxs[ci] = sval
                                        except Exception:
                                            pass

                    # normalize lists to empty lists when values missing
                    min_values = [m for m in mins if m is not None]
                    max_values = [m for m in maxs if m is not None]
            except Exception:
                # If metadata read fails, fall back to placeholders
                file_size = 0
                record_count = 0
                min_values = []
                max_values = []

            new_entries.append(
                {
                    "file_path": fp,
                    "file_format": "parquet",
                    "record_count": int(record_count),
                    "file_size_in_bytes": int(file_size),
                    "min_k_hashes": [],
                    "histogram_counts": [],
                    "histogram_bins": 0,
                    "min_values": min_values,
                    "max_values": max_values,
                }
            )

        merged_entries = prev_entries + new_entries

        # write cumulative manifest
        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, merged_entries, self.metadata.location
            )

        # Build summary deltas
        added_data_files = len(new_entries)
        added_files_size = 0
        added_records = 0
        deleted_data_files = 0
        deleted_files_size = 0
        deleted_records = 0

        total_data_files = prev_total_files + added_data_files - deleted_data_files
        total_files_size = prev_total_size + added_files_size - deleted_files_size
        total_records = prev_total_records + added_records - deleted_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-records": total_records,
        }

        # Sequence number
        try:
            max_seq = 0
            for s in self.metadata.snapshots:
                seq = getattr(s, "sequence_number", None)
                if seq is None:
                    continue
                try:
                    ival = int(seq)
                except Exception:
                    continue
                if ival > max_seq:
                    max_seq = ival
            next_seq = max_seq + 1
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        if commit_message is None:
            commit_message = f"add files by {author}"

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="add-files",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)

    def truncate_and_add_files(
        self, files: list[str], author: str = None, commit_message: Optional[str] = None
    ):
        """Truncate dataset (logical) and set manifest to provided files.

        - Writes a manifest that contains exactly the unique filenames provided.
        - Does not delete objects from storage.
        - Useful for replace/overwrite semantics.
        """
        if author is None:
            raise ValueError("author must be provided when truncating/adding files")

        snapshot_id = int(time.time() * 1000)

        # Read previous summary for reporting deleted counts
        prev = self.snapshot(None)
        prev_total_files = 0
        prev_total_size = 0
        prev_total_records = 0
        if prev and prev.summary:
            try:
                prev_total_files = int(prev.summary.get("total-data-files", 0))
            except Exception:
                prev_total_files = 0
            try:
                prev_total_size = int(prev.summary.get("total-files-size", 0))
            except Exception:
                prev_total_size = 0
            try:
                prev_total_records = int(prev.summary.get("total-records", 0))
            except Exception:
                prev_total_records = 0

        # Build unique new entries (ignore duplicates in input). Only accept
        # parquet files and try to read lightweight metadata from each file.
        new_entries = []
        seen = set()
        for fp in files:
            if not fp or fp in seen:
                continue
            if not fp.lower().endswith(".parquet"):
                continue
            seen.add(fp)

            file_size = 0
            record_count = 0
            min_values = []
            max_values = []
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq

                data = None
                if self.io and hasattr(self.io, "new_input"):
                    inp = self.io.new_input(fp)
                    with inp.open() as f:
                        data = f.read()
                else:
                    if (
                        self.catalog
                        and getattr(self.catalog, "_storage_client", None)
                        and getattr(self.catalog, "gcs_bucket", None)
                    ):
                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
                        parsed = fp
                        if parsed.startswith("gs://"):
                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
                        blob = bucket.blob(parsed)
                        data = blob.download_as_bytes()

                if data:
                    file_size = len(data)
                    pf = pq.ParquetFile(pa.BufferReader(data))
                    record_count = int(pf.metadata.num_rows or 0)

                    ncols = pf.metadata.num_columns
                    mins = [None] * ncols
                    maxs = [None] * ncols
                    for rg in range(pf.num_row_groups):
                        for ci in range(ncols):
                            col_meta = pf.metadata.row_group(rg).column(ci)
                            stats = getattr(col_meta, "statistics", None)
                            if not stats:
                                continue
                            smin = getattr(stats, "min", None)
                            smax = getattr(stats, "max", None)
                            if smin is None and smax is None:
                                continue

                            def _to_py(v):
                                try:
                                    return int(v)
                                except Exception:
                                    try:
                                        return float(v)
                                    except Exception:
                                        try:
                                            if isinstance(v, (bytes, bytearray)):
                                                return v.decode("utf-8", errors="ignore")
                                        except Exception:
                                            pass
                                        return v

                            if smin is not None:
                                sval = _to_py(smin)
                                if mins[ci] is None:
                                    mins[ci] = sval
                                else:
                                    try:
                                        if sval < mins[ci]:
                                            mins[ci] = sval
                                    except Exception:
                                        pass
                            if smax is not None:
                                sval = _to_py(smax)
                                if maxs[ci] is None:
                                    maxs[ci] = sval
                                else:
                                    try:
                                        if sval > maxs[ci]:
                                            maxs[ci] = sval
                                    except Exception:
                                        pass

                    min_values = [m for m in mins if m is not None]
                    max_values = [m for m in maxs if m is not None]
            except Exception:
                file_size = 0
                record_count = 0
                min_values = []
                max_values = []

            new_entries.append(
                {
                    "file_path": fp,
                    "file_format": "parquet",
                    "record_count": int(record_count),
                    "file_size_in_bytes": int(file_size),
                    "min_k_hashes": [],
                    "histogram_counts": [],
                    "histogram_bins": 0,
                    "min_values": min_values,
                    "max_values": max_values,
                }
            )

        manifest_path = None
        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
            manifest_path = self.catalog.write_parquet_manifest(
                snapshot_id, new_entries, self.metadata.location
            )

        # Build summary: previous entries become deleted
        deleted_data_files = prev_total_files
        deleted_files_size = prev_total_size
        deleted_records = prev_total_records

        added_data_files = len(new_entries)
        added_files_size = 0
        added_records = 0

        total_data_files = added_data_files
        total_files_size = added_files_size
        total_records = added_records

        summary = {
            "added-data-files": added_data_files,
            "added-files-size": added_files_size,
            "added-records": added_records,
            "deleted-data-files": deleted_data_files,
            "deleted-files-size": deleted_files_size,
            "deleted-records": deleted_records,
            "total-data-files": total_data_files,
            "total-files-size": total_files_size,
            "total-records": total_records,
        }

        # Sequence number
        try:
            max_seq = 0
            for s in self.metadata.snapshots:
                seq = getattr(s, "sequence_number", None)
                if seq is None:
                    continue
                try:
                    ival = int(seq)
                except Exception:
                    continue
                if ival > max_seq:
                    max_seq = ival
            next_seq = max_seq + 1
        except Exception:
            next_seq = 1

        parent_id = self.metadata.current_snapshot_id

        if commit_message is None:
            commit_message = f"truncate and add files by {author}"

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="truncate-and-add-files",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        # Replace in-memory snapshots: append snapshot and update current id
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            self.catalog.save_snapshot(self.identifier, snap)
        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
            self.catalog.save_dataset_metadata(self.identifier, self.metadata)

    def scan(
        self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
    ) -> Iterable[Datafile]:
        """Return Datafile objects for the given snapshot.

        - If `snapshot_id` is None, use the current snapshot.
        - Ignore `row_filter` for now and return all files listed in the
          snapshot's parquet manifest (if present).
        """
        # Determine snapshot to read using the dataset-level helper which
        # prefers the in-memory current snapshot and otherwise performs a
        # backend lookup for the requested id.
        snap = self.snapshot(snapshot_id)

        if snap is None or not getattr(snap, "manifest_list", None):
            return iter(())

        manifest_path = snap.manifest_list

        # Read manifest via FileIO if available
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq

            data = None

            inp = self.io.new_input(manifest_path)
            with inp.open() as f:
                data = f.read()

            if not data:
                return iter(())

            table = pq.read_table(pa.BufferReader(data))
            rows = table.to_pylist()
            cum_rows = 0
            for r in rows:
                yield Datafile(entry=r)
                try:
                    rc = int(r.get("record_count") or 0)
                except Exception:
                    rc = 0
                cum_rows += rc
                if row_limit is not None and cum_rows >= row_limit:
                    break
        except FileNotFoundError:
            return iter(())
        except Exception:
            return iter(())

    def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
        """Delete all data files and manifests for this table.

        This attempts to delete every data file referenced by existing
        Parquet manifests and then delete the manifest files themselves.
        Finally it clears the in-memory snapshot list and persists the
        empty snapshot set via the attached `catalog` (if available).
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        io = self.io
        # Collect files referenced by existing manifests but do NOT delete
        # them from storage. Instead we will write a new empty manifest and
        # create a truncate snapshot that records these files as deleted.
        snaps = list(self.metadata.snapshots)
        removed_files = []
        removed_total_size = 0

        for snap in snaps:
            manifest_path = getattr(snap, "manifest_list", None)
            if not manifest_path:
                continue

            # Read manifest via FileIO if available
            rows = []
            try:
                if hasattr(io, "new_input"):
                    inp = io.new_input(manifest_path)
                    with inp.open() as f:
                        data = f.read()
                    table = pq.read_table(pa.BufferReader(data))
                    rows = table.to_pylist()
            except Exception:
                rows = []

            for r in rows:
                fp = None
                fsize = 0
                if isinstance(r, dict):
                    fp = r.get("file_path")
                    fsize = int(r.get("file_size_in_bytes") or 0)
                    if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                        fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                        fsize = int(r["data_file"].get("file_size_in_bytes") or 0)

                if fp:
                    removed_files.append(fp)
                    removed_total_size += fsize

        # Create a new empty Parquet manifest (entries=[]) to represent the
        # truncated table for the new snapshot. Do not delete objects.
        snapshot_id = int(time.time() * 1000)

        # Do NOT write an empty Parquet manifest when there are no entries.
        # Per policy, create the snapshot without a manifest so older
        # snapshots remain readable and we avoid creating empty manifest files.
        manifest_path = None

        # Build summary reflecting deleted files (tracked, not removed)
        deleted_count = len(removed_files)
        deleted_size = removed_total_size

        summary = {
            "added-data-files": 0,
            "added-files-size": 0,
            "added-records": 0,
            "deleted-data-files": deleted_count,
            "deleted-files-size": deleted_size,
            "deleted-records": 0,
            "total-data-files": 0,
            "total-files-size": 0,
            "total-records": 0,
        }

        # Sequence number
        try:
            max_seq = 0
            for s in self.metadata.snapshots:
                seq = getattr(s, "sequence_number", None)
                if seq is None:
                    continue
                try:
                    ival = int(seq)
                except Exception:
                    continue
                if ival > max_seq:
                    max_seq = ival
            next_seq = max_seq + 1
        except Exception:
            next_seq = 1

        if author is None:
            raise ValueError(
                "truncate() must be called with an explicit author; use truncate(author=...) in caller"
            )
        # update metadata author/timestamp for this truncate
        self.metadata.author = author
        self.metadata.timestamp_ms = snapshot_id
        # default commit message
        if commit_message is None:
            commit_message = f"commit by {author}"

        parent_id = self.metadata.current_snapshot_id

        snap = Snapshot(
            snapshot_id=snapshot_id,
            timestamp_ms=snapshot_id,
            author=author,
            sequence_number=next_seq,
            user_created=True,
            operation_type="truncate",
            parent_snapshot_id=parent_id,
            manifest_list=manifest_path,
            schema_id=self.metadata.current_schema_id,
            commit_message=commit_message,
            summary=summary,
        )

        # Append new snapshot and update current snapshot id
        self.metadata.snapshots.append(snap)
        self.metadata.current_snapshot_id = snapshot_id

        if self.catalog and hasattr(self.catalog, "save_snapshot"):
            try:
                self.catalog.save_snapshot(self.identifier, snap)
            except Exception:
                pass
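
The per-column statistics that append() stores in each manifest entry (min_k_hashes, histogram_counts, min_values, max_values) are a K-minimum-values sketch plus a 32-bin equi-width histogram. The sketch below approximates that logic with a plain blake2b hash standing in for draken's vectorised 64-bit hash, purely to illustrate how the stored values can be computed and used; the distinct-count estimator shown is the standard KMV formula, not something defined in this package, and it applies only to numeric columns here.

import hashlib
import heapq

K = 32            # matches the K used by SimpleDataset.append()
HBINS = 32        # matches the stored histogram_bins
HASH_SPACE = 2**64

def _hash64(value) -> int:
    # stand-in for draken's 64-bit vector hash (assumption, not the library call)
    return int.from_bytes(hashlib.blake2b(str(value).encode(), digest_size=8).digest(), "big")

def column_sketch(values):
    """Return (min_k_hashes, histogram_counts, min, max) for a numeric column."""
    non_null = [v for v in values if v is not None]
    hashes = [_hash64(v) for v in non_null]
    min_k = sorted(heapq.nsmallest(K, hashes))

    vmin, vmax = min(non_null), max(non_null)
    hist = [0] * HBINS
    span = float(vmax - vmin) or 1.0
    for v in non_null:
        b = int(((float(v) - float(vmin)) / span) * (HBINS - 1))
        hist[min(max(b, 0), HBINS - 1)] += 1
    return min_k, hist, vmin, vmax

def estimate_distinct(min_k) -> float:
    # standard KMV estimator: (K - 1) / normalised K-th smallest hash
    if len(min_k) < K:
        return float(len(min_k))
    return (K - 1) / (min_k[-1] / HASH_SPACE)

min_k, hist, vmin, vmax = column_sketch(list(range(1000)) * 3)
print(round(estimate_distinct(min_k)), vmin, vmax, sum(hist))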