opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +433 -451
- opteryx_catalog/catalog/manifest.py +415 -0
- opteryx_catalog/catalog/metadata.py +2 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/opteryx_catalog.py +257 -231
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/METADATA +1 -1
- opteryx_catalog-0.4.11.dist-info/RECORD +25 -0
- scripts/create_dataset.py +1 -1
- scripts/read_dataset.py +1 -1
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/WHEEL +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/top_level.txt +0 -0
opteryx_catalog/catalog/dataset.py

@@ -8,6 +8,9 @@ from typing import Any
 from typing import Iterable
 from typing import Optional
 
+from .manifest import ParquetManifestEntry
+from .manifest import build_parquet_manifest_entry
+from .manifest import build_parquet_manifest_minmax_entry
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -69,6 +72,26 @@ class SimpleDataset(Dataset):
     def metadata(self) -> DatasetMetadata:
         return self._metadata
 
+    def _next_sequence_number(self) -> int:
+        """Calculate the next sequence number.
+
+        Uses the current snapshot's sequence number + 1. Works efficiently
+        with load_history=False since we only need the most recent snapshot,
+        not the full history.
+
+        Returns:
+            The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
+        """
+        if not self.metadata.snapshots:
+            # No snapshots yet - this is the first one
+            return 1
+
+        # Get the current (most recent) snapshot - should have the highest sequence number
+        current = self.snapshot()
+        if current:
+            seq = getattr(current, "sequence_number", None)
+            return int(seq) + 1 if seq is not None else 1
+
     def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
         """Return a Snapshot.
 
@@ -95,20 +118,17 @@ class SimpleDataset(Dataset):
             if doc.exists:
                 sd = doc.to_dict() or {}
                 snap = Snapshot(
-                    snapshot_id=int(
-
-                    ),
-                    timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
+                    snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
+                    timestamp_ms=int(sd.get("timestamp-ms", 0)),
                     author=sd.get("author"),
-                    sequence_number=sd.get("sequence-number"
-                    user_created=sd.get("user-created")
-                    manifest_list=sd.get("manifest")
-                    schema_id=sd.get("schema-id")
+                    sequence_number=sd.get("sequence-number", 0),
+                    user_created=sd.get("user-created"),
+                    manifest_list=sd.get("manifest"),
+                    schema_id=sd.get("schema-id"),
                     summary=sd.get("summary", {}),
-                    operation_type=sd.get("operation-type")
-                    parent_snapshot_id=sd.get("parent-snapshot-id")
-
-                    commit_message=sd.get("commit-message") or sd.get("commit_message"),
+                    operation_type=sd.get("operation-type"),
+                    parent_snapshot_id=sd.get("parent-snapshot-id"),
+                    commit_message=sd.get("commit-message"),
                 )
                 return snap
         except Exception:
@@ -227,148 +247,9 @@ class SimpleDataset(Dataset):
         if not hasattr(table, "schema"):
            raise TypeError("append() expects a pyarrow.Table-like object")
 
-        # Write
-
-
-        buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, compression="zstd")
-        pdata = buf.getvalue().to_pybytes()
-
-        out = self.io.new_output(data_path).create()
-        out.write(pdata)
-        out.close()
-
-        # Prepare sketches/stats
-        K = 32
-        HBINS = 32
-        min_k_hashes: list[list[int]] = []
-        histograms: list[list[int]] = []
-        min_values: list[int] = []
-        max_values: list[int] = []
-
-        # Use draken for efficient hashing and compression when available.
-        import heapq
-
-        # canonical NULL flag for missing values
-        NULL_FLAG = -(1 << 63)
-
-        try:
-            import opteryx.draken as draken  # type: ignore
-
-            num_rows = int(table.num_rows)
-
-            for col_idx, col in enumerate(table.columns):
-                # hash column values to 64-bit via draken (new cpdef API)
-                vec = draken.Vector.from_arrow(col)
-                hashes = list(vec.hash())
-
-                # Decide whether to compute min-k/histogram for this column based
-                # on field type and, for strings, average length of values.
-                field_type = table.schema.field(col_idx).type
-                compute_min_k = False
-                if (
-                    pa.types.is_integer(field_type)
-                    or pa.types.is_floating(field_type)
-                    or pa.types.is_decimal(field_type)
-                ):
-                    compute_min_k = True
-                elif (
-                    pa.types.is_timestamp(field_type)
-                    or pa.types.is_date(field_type)
-                    or pa.types.is_time(field_type)
-                ):
-                    compute_min_k = True
-                elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                    # compute average length from non-null values; only allow
-                    # min-k/histogram for short strings (avg <= 16)
-                    col_py = None
-                    try:
-                        col_py = col.to_pylist()
-                    except Exception:
-                        col_py = None
-
-                    if col_py is not None:
-                        lens = [len(x) for x in col_py if x is not None]
-                        if lens:
-                            avg_len = sum(lens) / len(lens)
-                            if avg_len <= 16:
-                                compute_min_k = True
-
-                # KMV: take K smallest hashes when allowed; otherwise store an
-                # empty list for this column.
-                if compute_min_k:
-                    smallest = heapq.nsmallest(K, hashes)
-                    col_min_k = sorted(smallest)
-                else:
-                    col_min_k = []
-
-                # For histogram decisions follow the same rule as min-k
-                compute_hist = compute_min_k
-
-                # Use draken.compress() to get canonical int64 per value
-                mapped = list(vec.compress())
-                non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-                if non_nulls_mapped:
-                    vmin = min(non_nulls_mapped)
-                    vmax = max(non_nulls_mapped)
-                    col_min = int(vmin)
-                    col_max = int(vmax)
-                    if compute_hist:
-                        if vmin == vmax:
-                            col_hist = [0] * HBINS
-                            col_hist[-1] = len(non_nulls_mapped)
-                        else:
-                            col_hist = [0] * HBINS
-                            span = float(vmax - vmin)
-                            for m in non_nulls_mapped:
-                                b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
-                                if b < 0:
-                                    b = 0
-                                if b >= HBINS:
-                                    b = HBINS - 1
-                                col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-                else:
-                    # no non-null values; histogram via hash buckets
-                    col_min = NULL_FLAG
-                    col_max = NULL_FLAG
-                    if compute_hist:
-                        col_hist = [0] * HBINS
-                        for h in hashes:
-                            b = (h >> (64 - 5)) & 0x1F
-                            col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-
-                min_k_hashes.append(col_min_k)
-                histograms.append(col_hist)
-                min_values.append(col_min)
-                max_values.append(col_max)
-        except Exception:
-            # If draken or its dependencies are unavailable, fall back to
-            # conservative defaults so we can still write the manifest and
-            # snapshot without failing the append operation.
-            num_cols = table.num_columns
-            min_k_hashes = [[] for _ in range(num_cols)]
-            HBINS = 32
-            histograms = [[0] * HBINS for _ in range(num_cols)]
-            min_values = [NULL_FLAG] * num_cols
-            max_values = [NULL_FLAG] * num_cols
-
-        entries = [
-            {
-                "file_path": data_path,
-                "file_format": "parquet",
-                "record_count": int(table.num_rows),
-                "file_size_in_bytes": len(pdata),
-                "min_k_hashes": min_k_hashes,
-                "histogram_counts": histograms,
-                "histogram_bins": HBINS,
-                "min_values": min_values,
-                "max_values": max_values,
-            }
-        ]
+        # Write table and build manifest entry
+        manifest_entry = self._write_table_and_build_entry(table)
+        entries = [manifest_entry.to_dict()]
 
         # persist manifest: for append, merge previous manifest entries
         # with the new entries so the snapshot's manifest is cumulative.
@@ -384,35 +265,15 @@ class SimpleDataset(Dataset):
             prev_manifest_path = prev_snap.manifest_list
             try:
                 # Prefer FileIO when available
-
-
-
-
-
-
-
-
-
-                    merged_entries = prev_rows + merged_entries
-                else:
-                    # Fall back to catalog storage client (GCS)
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev_manifest_path
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        prev_data = blob.download_as_bytes()
-                        import pyarrow as pa
-                        import pyarrow.parquet as pq
-
-                        prev_table = pq.read_table(pa.BufferReader(prev_data))
-                        prev_rows = prev_table.to_pylist()
-                        merged_entries = prev_rows + merged_entries
+                inp = self.io.new_input(prev_manifest_path)
+                with inp.open() as f:
+                    prev_data = f.read()
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
+                prev_table = pq.read_table(pa.BufferReader(prev_data))
+                prev_rows = prev_table.to_pylist()
+                merged_entries = prev_rows + merged_entries
             except Exception:
                 # If we can't read the previous manifest, continue with
                 # just the new entries (don't fail the append).
@@ -433,63 +294,52 @@ class SimpleDataset(Dataset):
             commit_message = f"commit by {author}"
 
         recs = int(table.num_rows)
-        fsize =
+        fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
+        # Calculate uncompressed size from the manifest entry
+        added_data_size = manifest_entry.uncompressed_size_in_bytes
         added_data_files = 1
         added_files_size = fsize
         added_records = recs
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
         prev = self.snapshot()
         if prev and prev.summary:
-
-
-
-
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         else:
             prev_total_files = 0
             prev_total_size = 0
+            prev_total_data_size = 0
             prev_total_records = 0
 
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
            "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -518,6 +368,136 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
+    def _write_table_and_build_entry(self, table: Any):
+        """Write a PyArrow table to storage and return a ParquetManifestEntry.
+
+        This centralizes the IO and manifest construction so other operations
+        (e.g. `overwrite`) can reuse the same behavior as `append`.
+        """
+        # Write parquet file with collision-resistant name
+        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
+        data_path = f"{self.metadata.location}/data/{fname}"
+
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, compression="zstd")
+        pdata = buf.getvalue().to_pybytes()
+
+        out = self.io.new_output(data_path).create()
+        out.write(pdata)
+        out.close()
+
+        # Build manifest entry with statistics
+        manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
+        return manifest_entry
+
+    def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
+        """Replace the dataset entirely with `table` in a single snapshot.
+
+        Semantics:
+        - Write the provided table as new data file(s)
+        - Create a new parquet manifest that contains only the new entries
+        - Create a snapshot that records previous files as deleted and the
+          new files as added (logical replace)
+        """
+        # Similar validation as append
+        snapshot_id = int(time.time() * 1000)
+
+        if not hasattr(table, "schema"):
+            raise TypeError("overwrite() expects a pyarrow.Table-like object")
+
+        if author is None:
+            raise ValueError("author must be provided when overwriting a dataset")
+
+        # Write new data and build manifest entries (single table -> single entry)
+        manifest_entry = self._write_table_and_build_entry(table)
+        new_entries = [manifest_entry.to_dict()]
+
+        # Write manifest containing only the new entries
+        manifest_path = None
+        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
+            manifest_path = self.catalog.write_parquet_manifest(
+                snapshot_id, new_entries, self.metadata.location
+            )
+
+        # Compute deltas: previous manifest becomes deleted
+        prev = self.snapshot(None)
+        prev_total_files = 0
+        prev_total_size = 0
+        prev_total_data_size = 0
+        prev_total_records = 0
+        if prev and prev.summary:
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
+
+        deleted_data_files = prev_total_files
+        deleted_files_size = prev_total_size
+        deleted_data_size = prev_total_data_size
+        deleted_records = prev_total_records
+
+        added_data_files = len(new_entries)
+        added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
+        added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
+        added_records = sum(e.get("record_count", 0) for e in new_entries)
+
+        total_data_files = added_data_files
+        total_files_size = added_files_size
+        total_data_size = added_data_size
+        total_records = added_records
+
+        summary = {
+            "added-data-files": added_data_files,
+            "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
+            "added-records": added_records,
+            "deleted-data-files": deleted_data_files,
+            "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
+            "deleted-records": deleted_records,
+            "total-data-files": total_data_files,
+            "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        if commit_message is None:
+            commit_message = f"overwrite by {author}"
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=snapshot_id,
+            author=author,
+            sequence_number=next_seq,
+            user_created=True,
+            operation_type="overwrite",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=commit_message,
+            summary=summary,
+        )
+
+        # Replace in-memory snapshots
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
     def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
         """Add filenames to the dataset manifest without writing the files.
 
@@ -540,45 +520,20 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
         prev_entries = []
         if prev and prev.summary:
-
-
-
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
-
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         if prev and getattr(prev, "manifest_list", None):
             # try to read prev manifest entries
             try:
                 import pyarrow as pa
                 import pyarrow.parquet as pq
 
-
-
-
-
-
-                    prev_entries = table.to_pylist()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev.manifest_list
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
-                        table = pq.read_table(pa.BufferReader(data))
-                        prev_entries = table.to_pylist()
+                inp = self.io.new_input(prev.manifest_list)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                prev_entries = table.to_pylist()
             except Exception:
                 prev_entries = []
 
@@ -601,146 +556,47 @@ class SimpleDataset(Dataset):
             seen.add(fp)
 
             # Attempt to read file bytes and parquet metadata
-
-            record_count = 0
-            min_values = []
-            max_values = []
+            # Use rugo's metadata reader which is much faster (microseconds per file)
             try:
-
-
-
-                data = None
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(fp)
-                    with inp.open() as f:
-                        data = f.read()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = fp
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
 
                 if data:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        for ci in range(ncols):
-                            try:
-                                col = table.column(ci)
-                                # combine chunks if needed
-                                if hasattr(col, "combine_chunks"):
-                                    arr = col.combine_chunks()
-                                else:
-                                    arr = col
-                                vec = draken.Vector.from_arrow(arr)
-                                mapped = list(vec.compress())
-                                non_nulls = [m for m in mapped if m != NULL_FLAG]
-                                if non_nulls:
-                                    mins[ci] = int(min(non_nulls))
-                                    maxs[ci] = int(max(non_nulls))
-                                else:
-                                    mins[ci] = None
-                                    maxs[ci] = None
-                            except Exception:
-                                # per-column fallback: leave None
-                                mins[ci] = None
-                                maxs[ci] = None
-                    except Exception:
-                        # Draken not available; fall back to Parquet footer stats
-                        ncols = pf.metadata.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-                        for rg in range(pf.num_row_groups):
-                            for ci in range(ncols):
-                                col_meta = pf.metadata.row_group(rg).column(ci)
-                                stats = getattr(col_meta, "statistics", None)
-                                if not stats:
-                                    continue
-                                smin = getattr(stats, "min", None)
-                                smax = getattr(stats, "max", None)
-                                if smin is None and smax is None:
-                                    continue
-
-                                def _to_py(v):
-                                    try:
-                                        return int(v)
-                                    except Exception:
-                                        try:
-                                            return float(v)
-                                        except Exception:
-                                            try:
-                                                if isinstance(v, (bytes, bytearray)):
-                                                    return v.decode("utf-8", errors="ignore")
-                                            except Exception:
-                                                pass
-                                        return v
-
-                                if smin is not None:
-                                    sval = _to_py(smin)
-                                    if mins[ci] is None:
-                                        mins[ci] = sval
-                                    else:
-                                        try:
-                                            if sval < mins[ci]:
-                                                mins[ci] = sval
-                                        except Exception:
-                                            pass
-                                if smax is not None:
-                                    sval = _to_py(smax)
-                                    if maxs[ci] is None:
-                                        maxs[ci] = sval
-                                    else:
-                                        try:
-                                            if sval > maxs[ci]:
-                                                maxs[ci] = sval
-                                        except Exception:
-                                            pass
-
-                        # normalize lists to empty lists when values missing
-                        min_values = [m for m in mins if m is not None]
-                        max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
                 # If metadata read fails, fall back to placeholders
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        "max_values": max_values,
-                    }
-                )
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
 
         merged_entries = prev_entries + new_entries
 
@@ -754,41 +610,43 @@ class SimpleDataset(Dataset):
         # Build summary deltas
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum uncompressed sizes from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
+        prev_total_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
+
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -897,6 +755,7 @@ class SimpleDataset(Dataset):
                 ncols = pf.metadata.num_columns
                 mins = [None] * ncols
                 maxs = [None] * ncols
+                null_counts = [0] * ncols
                 for rg in range(pf.num_row_groups):
                     for ci in range(ncols):
                         col_meta = pf.metadata.row_group(rg).column(ci)
@@ -905,7 +764,8 @@ class SimpleDataset(Dataset):
                             continue
                         smin = getattr(stats, "min", None)
                         smax = getattr(stats, "max", None)
-
+                        snull_count = getattr(stats, "null_count", None)
+                        if smin is None and smax is None and snull_count is None:
                             continue
 
                        def _to_py(v):
@@ -942,6 +802,11 @@ class SimpleDataset(Dataset):
                                        maxs[ci] = sval
                                except Exception:
                                    pass
+                        if snull_count is not None:
+                            try:
+                                null_counts[ci] += int(snull_count)
+                            except Exception:
+                                pass
 
                 min_values = [m for m in mins if m is not None]
                 max_values = [m for m in maxs if m is not None]
@@ -950,20 +815,23 @@ class SimpleDataset(Dataset):
                record_count = 0
                min_values = []
                max_values = []
-
-
-
-
-
-
-
-
-
-
-
-
-
+                null_counts = []
+
+            manifest_entry = ParquetManifestEntry(
+                file_path=fp,
+                file_format="parquet",
+                record_count=int(record_count),
+                null_counts=null_counts,
+                file_size_in_bytes=int(file_size),
+                uncompressed_size_in_bytes=int(file_size),  # Use compressed size as estimate
+                column_uncompressed_sizes_in_bytes=[],
+                min_k_hashes=[],
+                histogram_counts=[],
+                histogram_bins=0,
+                min_values=min_values,
+                max_values=max_values,
             )
+            new_entries.append(manifest_entry.to_dict())
 
         manifest_path = None
         if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
@@ -974,42 +842,42 @@ class SimpleDataset(Dataset):
         # Build summary: previous entries become deleted
         deleted_data_files = prev_total_files
         deleted_files_size = prev_total_size
+        deleted_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
         deleted_records = prev_total_records
 
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
+        # Sum uncompressed sizes from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
         added_records = 0
 
         total_data_files = added_data_files
         total_files_size = added_files_size
+        total_data_size = added_data_size
         total_records = added_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -1042,13 +910,11 @@ class SimpleDataset(Dataset):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
     def scan(
-        self, row_filter=None,
+        self, row_filter=None, snapshot_id: Optional[int] = None
     ) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
 
        - If `snapshot_id` is None, use the current snapshot.
-        - Ignore `row_filter` for now and return all files listed in the
-          snapshot's parquet manifest (if present).
        """
         # Determine snapshot to read using the dataset-level helper which
         # prefers the in-memory current snapshot and otherwise performs a
@@ -1065,8 +931,6 @@ class SimpleDataset(Dataset):
            import pyarrow as pa
            import pyarrow.parquet as pq
 
-            data = None
-
            inp = self.io.new_input(manifest_path)
            with inp.open() as f:
                data = f.read()
@@ -1076,23 +940,148 @@ class SimpleDataset(Dataset):
 
            table = pq.read_table(pa.BufferReader(data))
            rows = table.to_pylist()
-            cum_rows = 0
            for r in rows:
                yield Datafile(entry=r)
-                try:
-                    rc = int(r.get("record_count") or 0)
-                except Exception:
-                    rc = 0
-                cum_rows += rc
-                if row_limit is not None and cum_rows >= row_limit:
-                    break
        except FileNotFoundError:
            return iter(())
        except Exception:
            return iter(())
 
+    def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
+        """Refresh manifest statistics and create a new snapshot.
+
+        - `agent`: identifier for the agent performing the refresh (string)
+        - `author`: optional author to record; if omitted uses current snapshot author
+
+        This recalculates per-file statistics (min/max, record counts, sizes)
+        for every file in the current manifest, writes a new manifest and
+        creates a new snapshot with `user_created=False` and
+        `operation_type='statistics-refresh'`.
+
+        Returns the new `snapshot_id` on success or None on failure.
+        """
+        prev = self.snapshot(None)
+        if prev is None or not getattr(prev, "manifest_list", None):
+            raise ValueError("No current manifest available to refresh")
+
+        # Use same author/commit-timestamp as previous snapshot unless overridden
+        use_author = author if author is not None else getattr(prev, "author", None)
+
+        snapshot_id = int(time.time() * 1000)
+
+        # Rebuild manifest entries by re-reading each data file
+        entries = []
+        try:
+            # Read previous manifest entries
+            inp = self.io.new_input(prev.manifest_list)
+            with inp.open() as f:
+                prev_data = f.read()
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            prev_table = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_table.to_pylist()
+        except Exception:
+            prev_rows = []
+
+        total_files = 0
+        total_size = 0
+        total_data_size = 0
+        total_records = 0
+
+        for ent in prev_rows:
+            if not isinstance(ent, dict):
+                continue
+            fp = ent.get("file_path")
+            if not fp:
+                continue
+            try:
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
+                # Full statistics including histograms and k-hashes
+                table = pq.read_table(pa.BufferReader(data))
+                manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
+                dent = manifest_entry.to_dict()
+            except Exception:
+                # Fall back to original entry if re-read fails
+                dent = ent
+
+            entries.append(dent)
+            total_files += 1
+            total_size += int(dent.get("file_size_in_bytes") or 0)
+            total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
+            total_records += int(dent.get("record_count") or 0)
+
+        # write new manifest
+        manifest_path = self.catalog.write_parquet_manifest(
+            snapshot_id, entries, self.metadata.location
+        )
+
+        # Build summary
+        summary = {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-data-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-data-size": 0,
+            "deleted-records": 0,
+            "total-data-files": total_files,
+            "total-files-size": total_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        # Agent committer metadata
+        agent_meta = {
+            "timestamp": int(time.time() * 1000),
+            "action": "statistics-refresh",
+            "agent": agent,
+        }
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
+            author=use_author,
+            sequence_number=next_seq,
+            user_created=False,
+            operation_type="statistics-refresh",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=getattr(prev, "commit_message", "statistics refresh"),
+            summary=summary,
+        )
+
+        # attach agent metadata under summary
+        if snap.summary is None:
+            snap.summary = {}
+        snap.summary["agent-committer"] = agent_meta
+
+        # update in-memory metadata
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        # persist
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
+        return snapshot_id
+
     def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
-        """Delete all data files and manifests for this
+        """Delete all data files and manifests for this dataset.
 
        This attempts to delete every data file referenced by existing
        Parquet manifests and then delete the manifest files themselves.
@@ -1109,6 +1098,7 @@ class SimpleDataset(Dataset):
        snaps = list(self.metadata.snapshots)
        removed_files = []
        removed_total_size = 0
+        removed_data_size = 0
 
        for snap in snaps:
            manifest_path = getattr(snap, "manifest_list", None)
@@ -1118,31 +1108,34 @@ class SimpleDataset(Dataset):
            # Read manifest via FileIO if available
            rows = []
            try:
-
-
-
-
-
-                    rows = table.to_pylist()
+                inp = io.new_input(manifest_path)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                rows = table.to_pylist()
            except Exception:
                rows = []
 
            for r in rows:
                fp = None
                fsize = 0
+                data_size = 0
                if isinstance(r, dict):
                    fp = r.get("file_path")
                    fsize = int(r.get("file_size_in_bytes") or 0)
+                    data_size = int(r.get("uncompressed_size_in_bytes") or 0)
                    if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                        fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                        fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
+                        data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
 
                if fp:
                    removed_files.append(fp)
                    removed_total_size += fsize
+                    removed_data_size += data_size
 
        # Create a new empty Parquet manifest (entries=[]) to represent the
-        # truncated
+        # truncated dataset for the new snapshot. Do not delete objects.
        snapshot_id = int(time.time() * 1000)
 
        # Do NOT write an empty Parquet manifest when there are no entries.
@@ -1157,29 +1150,21 @@ class SimpleDataset(Dataset):
        summary = {
            "added-data-files": 0,
            "added-files-size": 0,
+            "added-data-size": 0,
            "added-records": 0,
            "deleted-data-files": deleted_count,
            "deleted-files-size": deleted_size,
+            "deleted-data-size": removed_data_size,
            "deleted-records": 0,
            "total-data-files": 0,
            "total-files-size": 0,
+            "total-data-size": 0,
            "total-records": 0,
        }
 
        # Sequence number
        try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1
 
@@ -1215,7 +1200,4 @@ class SimpleDataset(Dataset):
        self.metadata.current_snapshot_id = snapshot_id
 
        if self.catalog and hasattr(self.catalog, "save_snapshot"):
-
-                self.catalog.save_snapshot(self.identifier, snap)
-            except Exception:
-                pass
+            self.catalog.save_snapshot(self.identifier, snap)
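The headline additions to `opteryx_catalog/catalog/dataset.py` in this release are the `overwrite` and `refresh_manifest` methods. The sketch below is illustrative only: it assumes a `SimpleDataset` handle is passed in (how that handle is obtained from the catalog is not part of this diff), and the function name, author string, and agent label are invented for the example; only the `overwrite()` and `refresh_manifest()` signatures and their snapshot semantics come from the new code shown above.

import pyarrow as pa


def replace_and_refresh(dataset, author="maintenance@example.com"):
    """Illustrative only: drive the overwrite/refresh_manifest methods added in 0.4.11."""
    table = pa.table({"id": [1, 2, 3], "name": ["Mercury", "Venus", "Earth"]})

    # Single-snapshot replace: previous files are recorded as deleted and the
    # new file as added (operation_type="overwrite").
    dataset.overwrite(table, author=author, commit_message="rebuild dataset")

    # Recompute per-file statistics and commit a non-user snapshot
    # (user_created=False, operation_type="statistics-refresh").
    return dataset.refresh_manifest(agent="maintenance-job", author=author)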