opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +536 -0
- opteryx_catalog/catalog/dataset.py +840 -520
- opteryx_catalog/catalog/manifest.py +475 -0
- opteryx_catalog/catalog/metadata.py +5 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +296 -242
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/create_dataset.py +1 -1
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- scripts/read_dataset.py +1 -1
- tests/test_collections.py +37 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
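The hunks below are the rewrite of opteryx_catalog/catalog/dataset.py, which adds `overwrite`, `describe` and `refresh_manifest` to `SimpleDataset`. A minimal usage sketch of those new operations, assuming a dataset handle obtained from a catalog — the `catalog.load_dataset(...)` call and the identifier are hypothetical placeholders, only the method names and parameters come from this diff:

    import pyarrow as pa

    # hypothetical: how the dataset handle is obtained is not shown in this diff
    dataset = catalog.load_dataset("examples.planets")

    table = pa.table({"id": [1, 2, 3], "name": ["Mercury", "Venus", "Earth"]})

    dataset.append(table, author="joocer")                   # cumulative manifest, new snapshot
    dataset.overwrite(table, author="joocer")                 # logical replace in a single snapshot
    stats = dataset.describe(bins=10)                         # per-column stats read from the manifest
    new_snapshot_id = dataset.refresh_manifest(agent="cron")  # recompute per-file statistics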
opteryx_catalog/catalog/dataset.py

@@ -8,6 +8,8 @@ from typing import Any
 from typing import Iterable
 from typing import Optional
 
+from .manifest import ParquetManifestEntry
+from .manifest import build_parquet_manifest_entry_from_bytes
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -69,6 +71,26 @@ class SimpleDataset(Dataset):
     def metadata(self) -> DatasetMetadata:
         return self._metadata
 
+    def _next_sequence_number(self) -> int:
+        """Calculate the next sequence number.
+
+        Uses the current snapshot's sequence number + 1. Works efficiently
+        with load_history=False since we only need the most recent snapshot,
+        not the full history.
+
+        Returns:
+            The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
+        """
+        if not self.metadata.snapshots:
+            # No snapshots yet - this is the first one
+            return 1
+
+        # Get the current (most recent) snapshot - should have the highest sequence number
+        current = self.snapshot()
+        if current:
+            seq = getattr(current, "sequence_number", None)
+            return int(seq) + 1 if seq is not None else 1
+
     def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
         """Return a Snapshot.
 
@@ -95,20 +117,17 @@ class SimpleDataset(Dataset):
             if doc.exists:
                 sd = doc.to_dict() or {}
                 snap = Snapshot(
-                    snapshot_id=int(
-
-                    ),
-                    timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
+                    snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
+                    timestamp_ms=int(sd.get("timestamp-ms", 0)),
                     author=sd.get("author"),
-                    sequence_number=sd.get("sequence-number"
-                    user_created=sd.get("user-created")
-                    manifest_list=sd.get("manifest")
-                    schema_id=sd.get("schema-id")
+                    sequence_number=sd.get("sequence-number", 0),
+                    user_created=sd.get("user-created"),
+                    manifest_list=sd.get("manifest"),
+                    schema_id=sd.get("schema-id"),
                     summary=sd.get("summary", {}),
-                    operation_type=sd.get("operation-type")
-                    parent_snapshot_id=sd.get("parent-snapshot-id")
-
-                    commit_message=sd.get("commit-message") or sd.get("commit_message"),
+                    operation_type=sd.get("operation-type"),
+                    parent_snapshot_id=sd.get("parent-snapshot-id"),
+                    commit_message=sd.get("commit-message"),
                 )
                 return snap
         except Exception:
@@ -227,148 +246,9 @@ class SimpleDataset(Dataset):
         if not hasattr(table, "schema"):
             raise TypeError("append() expects a pyarrow.Table-like object")
 
-        # Write
-
-
-        buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, compression="zstd")
-        pdata = buf.getvalue().to_pybytes()
-
-        out = self.io.new_output(data_path).create()
-        out.write(pdata)
-        out.close()
-
-        # Prepare sketches/stats
-        K = 32
-        HBINS = 32
-        min_k_hashes: list[list[int]] = []
-        histograms: list[list[int]] = []
-        min_values: list[int] = []
-        max_values: list[int] = []
-
-        # Use draken for efficient hashing and compression when available.
-        import heapq
-
-        # canonical NULL flag for missing values
-        NULL_FLAG = -(1 << 63)
-
-        try:
-            import opteryx.draken as draken  # type: ignore
-
-            num_rows = int(table.num_rows)
-
-            for col_idx, col in enumerate(table.columns):
-                # hash column values to 64-bit via draken (new cpdef API)
-                vec = draken.Vector.from_arrow(col)
-                hashes = list(vec.hash())
-
-                # Decide whether to compute min-k/histogram for this column based
-                # on field type and, for strings, average length of values.
-                field_type = table.schema.field(col_idx).type
-                compute_min_k = False
-                if (
-                    pa.types.is_integer(field_type)
-                    or pa.types.is_floating(field_type)
-                    or pa.types.is_decimal(field_type)
-                ):
-                    compute_min_k = True
-                elif (
-                    pa.types.is_timestamp(field_type)
-                    or pa.types.is_date(field_type)
-                    or pa.types.is_time(field_type)
-                ):
-                    compute_min_k = True
-                elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                    # compute average length from non-null values; only allow
-                    # min-k/histogram for short strings (avg <= 16)
-                    col_py = None
-                    try:
-                        col_py = col.to_pylist()
-                    except Exception:
-                        col_py = None
-
-                    if col_py is not None:
-                        lens = [len(x) for x in col_py if x is not None]
-                        if lens:
-                            avg_len = sum(lens) / len(lens)
-                            if avg_len <= 16:
-                                compute_min_k = True
-
-                # KMV: take K smallest hashes when allowed; otherwise store an
-                # empty list for this column.
-                if compute_min_k:
-                    smallest = heapq.nsmallest(K, hashes)
-                    col_min_k = sorted(smallest)
-                else:
-                    col_min_k = []
-
-                # For histogram decisions follow the same rule as min-k
-                compute_hist = compute_min_k
-
-                # Use draken.compress() to get canonical int64 per value
-                mapped = list(vec.compress())
-                non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-                if non_nulls_mapped:
-                    vmin = min(non_nulls_mapped)
-                    vmax = max(non_nulls_mapped)
-                    col_min = int(vmin)
-                    col_max = int(vmax)
-                    if compute_hist:
-                        if vmin == vmax:
-                            col_hist = [0] * HBINS
-                            col_hist[-1] = len(non_nulls_mapped)
-                        else:
-                            col_hist = [0] * HBINS
-                            span = float(vmax - vmin)
-                            for m in non_nulls_mapped:
-                                b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
-                                if b < 0:
-                                    b = 0
-                                if b >= HBINS:
-                                    b = HBINS - 1
-                                col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-                else:
-                    # no non-null values; histogram via hash buckets
-                    col_min = NULL_FLAG
-                    col_max = NULL_FLAG
-                    if compute_hist:
-                        col_hist = [0] * HBINS
-                        for h in hashes:
-                            b = (h >> (64 - 5)) & 0x1F
-                            col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-
-                min_k_hashes.append(col_min_k)
-                histograms.append(col_hist)
-                min_values.append(col_min)
-                max_values.append(col_max)
-        except Exception:
-            # If draken or its dependencies are unavailable, fall back to
-            # conservative defaults so we can still write the manifest and
-            # snapshot without failing the append operation.
-            num_cols = table.num_columns
-            min_k_hashes = [[] for _ in range(num_cols)]
-            HBINS = 32
-            histograms = [[0] * HBINS for _ in range(num_cols)]
-            min_values = [NULL_FLAG] * num_cols
-            max_values = [NULL_FLAG] * num_cols
-
-        entries = [
-            {
-                "file_path": data_path,
-                "file_format": "parquet",
-                "record_count": int(table.num_rows),
-                "file_size_in_bytes": len(pdata),
-                "min_k_hashes": min_k_hashes,
-                "histogram_counts": histograms,
-                "histogram_bins": HBINS,
-                "min_values": min_values,
-                "max_values": max_values,
-            }
-        ]
+        # Write table and build manifest entry
+        manifest_entry = self._write_table_and_build_entry(table)
+        entries = [manifest_entry.to_dict()]
 
         # persist manifest: for append, merge previous manifest entries
         # with the new entries so the snapshot's manifest is cumulative.
@@ -384,35 +264,15 @@ class SimpleDataset(Dataset):
            prev_manifest_path = prev_snap.manifest_list
            try:
                # Prefer FileIO when available
-
-
-
-
-
-
-
-
-
-                    merged_entries = prev_rows + merged_entries
-                else:
-                    # Fall back to catalog storage client (GCS)
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev_manifest_path
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        prev_data = blob.download_as_bytes()
-                        import pyarrow as pa
-                        import pyarrow.parquet as pq
-
-                        prev_table = pq.read_table(pa.BufferReader(prev_data))
-                        prev_rows = prev_table.to_pylist()
-                        merged_entries = prev_rows + merged_entries
+                inp = self.io.new_input(prev_manifest_path)
+                with inp.open() as f:
+                    prev_data = f.read()
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
+                prev_table = pq.read_table(pa.BufferReader(prev_data))
+                prev_rows = prev_table.to_pylist()
+                merged_entries = prev_rows + merged_entries
            except Exception:
                # If we can't read the previous manifest, continue with
                # just the new entries (don't fail the append).
@@ -433,63 +293,52 @@ class SimpleDataset(Dataset):
             commit_message = f"commit by {author}"
 
         recs = int(table.num_rows)
-        fsize =
+        fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
+        # Calculate uncompressed size from the manifest entry
+        added_data_size = manifest_entry.uncompressed_size_in_bytes
         added_data_files = 1
         added_files_size = fsize
         added_records = recs
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
         prev = self.snapshot()
         if prev and prev.summary:
-
-
-
-
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         else:
             prev_total_files = 0
             prev_total_size = 0
+            prev_total_data_size = 0
             prev_total_records = 0
 
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -518,6 +367,140 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
+    def _write_table_and_build_entry(self, table: Any):
+        """Write a PyArrow table to storage and return a ParquetManifestEntry.
+
+        This centralizes the IO and manifest construction so other operations
+        (e.g. `overwrite`) can reuse the same behavior as `append`.
+        """
+        # Write parquet file with collision-resistant name
+        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
+        data_path = f"{self.metadata.location}/data/{fname}"
+
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        from ..iops.fileio import WRITE_PARQUET_OPTIONS
+
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
+        pdata = buf.getvalue().to_pybytes()
+
+        out = self.io.new_output(data_path).create()
+        out.write(pdata)
+        out.close()
+
+        # Build manifest entry with statistics using a bytes-based, per-column scan
+        manifest_entry = build_parquet_manifest_entry_from_bytes(
+            pdata, data_path, len(pdata), orig_table=table
+        )
+        return manifest_entry
+
+    def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
+        """Replace the dataset entirely with `table` in a single snapshot.
+
+        Semantics:
+        - Write the provided table as new data file(s)
+        - Create a new parquet manifest that contains only the new entries
+        - Create a snapshot that records previous files as deleted and the
+          new files as added (logical replace)
+        """
+        # Similar validation as append
+        snapshot_id = int(time.time() * 1000)
+
+        if not hasattr(table, "schema"):
+            raise TypeError("overwrite() expects a pyarrow.Table-like object")
+
+        if author is None:
+            raise ValueError("author must be provided when overwriting a dataset")
+
+        # Write new data and build manifest entries (single table -> single entry)
+        manifest_entry = self._write_table_and_build_entry(table)
+        new_entries = [manifest_entry.to_dict()]
+
+        # Write manifest containing only the new entries
+        manifest_path = None
+        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
+            manifest_path = self.catalog.write_parquet_manifest(
+                snapshot_id, new_entries, self.metadata.location
+            )
+
+        # Compute deltas: previous manifest becomes deleted
+        prev = self.snapshot(None)
+        prev_total_files = 0
+        prev_total_size = 0
+        prev_total_data_size = 0
+        prev_total_records = 0
+        if prev and prev.summary:
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
+
+        deleted_data_files = prev_total_files
+        deleted_files_size = prev_total_size
+        deleted_data_size = prev_total_data_size
+        deleted_records = prev_total_records
+
+        added_data_files = len(new_entries)
+        added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
+        added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
+        added_records = sum(e.get("record_count", 0) for e in new_entries)
+
+        total_data_files = added_data_files
+        total_files_size = added_files_size
+        total_data_size = added_data_size
+        total_records = added_records
+
+        summary = {
+            "added-data-files": added_data_files,
+            "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
+            "added-records": added_records,
+            "deleted-data-files": deleted_data_files,
+            "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
+            "deleted-records": deleted_records,
+            "total-data-files": total_data_files,
+            "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        if commit_message is None:
+            commit_message = f"overwrite by {author}"
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=snapshot_id,
+            author=author,
+            sequence_number=next_seq,
+            user_created=True,
+            operation_type="overwrite",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=commit_message,
+            summary=summary,
+        )
+
+        # Replace in-memory snapshots
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
     def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
         """Add filenames to the dataset manifest without writing the files.
 
@@ -540,45 +523,20 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
         prev_entries = []
         if prev and prev.summary:
-
-
-
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
-
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         if prev and getattr(prev, "manifest_list", None):
             # try to read prev manifest entries
             try:
                 import pyarrow as pa
                 import pyarrow.parquet as pq
 
-
-
-
-
-
-                    prev_entries = table.to_pylist()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev.manifest_list
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
-                        table = pq.read_table(pa.BufferReader(data))
-                        prev_entries = table.to_pylist()
+                inp = self.io.new_input(prev.manifest_list)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                prev_entries = table.to_pylist()
             except Exception:
                 prev_entries = []
 
@@ -587,9 +545,7 @@ class SimpleDataset(Dataset):
         }
 
         # Build new entries for files that don't already exist. Only accept
-        # Parquet files and
-        # row count, per-column min/max) from the Parquet footer when
-        # available.
+        # Parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -600,147 +556,52 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            #
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
+            # Read file and compute full statistics
            try:
                import pyarrow as pa
                import pyarrow.parquet as pq
 
-
-
-
-                    with inp.open() as f:
-                        data = f.read()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = fp
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
 
                if data:
+                    # Compute statistics using a single read of the compressed bytes
                    file_size = len(data)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        for ci in range(ncols):
-                            try:
-                                col = table.column(ci)
-                                # combine chunks if needed
-                                if hasattr(col, "combine_chunks"):
-                                    arr = col.combine_chunks()
-                                else:
-                                    arr = col
-                                vec = draken.Vector.from_arrow(arr)
-                                mapped = list(vec.compress())
-                                non_nulls = [m for m in mapped if m != NULL_FLAG]
-                                if non_nulls:
-                                    mins[ci] = int(min(non_nulls))
-                                    maxs[ci] = int(max(non_nulls))
-                                else:
-                                    mins[ci] = None
-                                    maxs[ci] = None
-                            except Exception:
-                                # per-column fallback: leave None
-                                mins[ci] = None
-                                maxs[ci] = None
-                    except Exception:
-                        # Draken not available; fall back to Parquet footer stats
-                        ncols = pf.metadata.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-                        for rg in range(pf.num_row_groups):
-                            for ci in range(ncols):
-                                col_meta = pf.metadata.row_group(rg).column(ci)
-                                stats = getattr(col_meta, "statistics", None)
-                                if not stats:
-                                    continue
-                                smin = getattr(stats, "min", None)
-                                smax = getattr(stats, "max", None)
-                                if smin is None and smax is None:
-                                    continue
-
-                                def _to_py(v):
-                                    try:
-                                        return int(v)
-                                    except Exception:
-                                        try:
-                                            return float(v)
-                                        except Exception:
-                                            try:
-                                                if isinstance(v, (bytes, bytearray)):
-                                                    return v.decode("utf-8", errors="ignore")
-                                            except Exception:
-                                                pass
-                                            return v
-
-                                if smin is not None:
-                                    sval = _to_py(smin)
-                                    if mins[ci] is None:
-                                        mins[ci] = sval
-                                    else:
-                                        try:
-                                            if sval < mins[ci]:
-                                                mins[ci] = sval
-                                        except Exception:
-                                            pass
-                                if smax is not None:
-                                    sval = _to_py(smax)
-                                    if maxs[ci] is None:
-                                        maxs[ci] = sval
-                                    else:
-                                        try:
-                                            if sval > maxs[ci]:
-                                                maxs[ci] = sval
-                                        except Exception:
-                                            pass
-
-                        # normalize lists to empty lists when values missing
-                        min_values = [m for m in mins if m is not None]
-                        max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
            except Exception:
-                # If
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        "max_values": max_values,
-                    }
-                )
+                # If read fails, fall back to placeholders
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
 
         merged_entries = prev_entries + new_entries
 
@@ -754,41 +615,44 @@ class SimpleDataset(Dataset):
         # Build summary deltas
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum statistics from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
+        prev_total_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
+
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -853,7 +717,7 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
 
         # Build unique new entries (ignore duplicates in input). Only accept
-        # parquet files and
+        # parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -863,14 +727,7 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
            try:
-                import pyarrow as pa
-                import pyarrow.parquet as pq
-
                data = None
                if self.io and hasattr(self.io, "new_input"):
                    inp = self.io.new_input(fp)
@@ -890,80 +747,42 @@ class SimpleDataset(Dataset):
                        data = blob.download_as_bytes()
 
                if data:
+                    # Compute statistics using a single read of the compressed bytes
                    file_size = len(data)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                def _to_py(v):
-                                    try:
-                                        return int(v)
-                                    except Exception:
-                                        try:
-                                            return float(v)
-                                        except Exception:
-                                            try:
-                                                if isinstance(v, (bytes, bytearray)):
-                                                    return v.decode("utf-8", errors="ignore")
-                                            except Exception:
-                                                pass
-                                            return v
-
-                                if smin is not None:
-                                    sval = _to_py(smin)
-                                    if mins[ci] is None:
-                                        mins[ci] = sval
-                                    else:
-                                        try:
-                                            if sval < mins[ci]:
-                                                mins[ci] = sval
-                                        except Exception:
-                                            pass
-                                if smax is not None:
-                                    sval = _to_py(smax)
-                                    if maxs[ci] is None:
-                                        maxs[ci] = sval
-                                    else:
-                                        try:
-                                            if sval > maxs[ci]:
-                                                maxs[ci] = sval
-                                        except Exception:
-                                            pass
-
-                        min_values = [m for m in mins if m is not None]
-                        max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
            except Exception:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    }
-                )
+                # If read fails, create placeholder entry
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
 
         manifest_path = None
         if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
@@ -974,42 +793,43 @@ class SimpleDataset(Dataset):
         # Build summary: previous entries become deleted
         deleted_data_files = prev_total_files
         deleted_files_size = prev_total_size
+        deleted_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
         deleted_records = prev_total_records
 
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum statistics from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
 
         total_data_files = added_data_files
         total_files_size = added_files_size
+        total_data_size = added_data_size
         total_records = added_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -1041,14 +861,10 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
-    def scan(
-        self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
-    ) -> Iterable[Datafile]:
+    def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
 
         - If `snapshot_id` is None, use the current snapshot.
-        - Ignore `row_filter` for now and return all files listed in the
-          snapshot's parquet manifest (if present).
         """
         # Determine snapshot to read using the dataset-level helper which
         # prefers the in-memory current snapshot and otherwise performs a
@@ -1065,8 +881,6 @@ class SimpleDataset(Dataset):
            import pyarrow as pa
            import pyarrow.parquet as pq
 
-            data = None
-
            inp = self.io.new_input(manifest_path)
            with inp.open() as f:
                data = f.read()
@@ -1076,23 +890,536 @@ class SimpleDataset(Dataset):
 
            table = pq.read_table(pa.BufferReader(data))
            rows = table.to_pylist()
-            cum_rows = 0
            for r in rows:
                yield Datafile(entry=r)
-                try:
-                    rc = int(r.get("record_count") or 0)
-                except Exception:
-                    rc = 0
-                cum_rows += rc
-                if row_limit is not None and cum_rows >= row_limit:
-                    break
        except FileNotFoundError:
            return iter(())
        except Exception:
            return iter(())
 
+    def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
+        """Describe all schema columns for the given snapshot.
+
+        Returns a dict mapping column name -> statistics (same shape as
+        the previous `describe` per-column output).
+        """
+        import heapq
+
+        snap = self.snapshot(snapshot_id)
+        if snap is None or not getattr(snap, "manifest_list", None):
+            raise ValueError("No manifest available for this dataset/snapshot")
+
+        manifest_path = snap.manifest_list
+
+        # Read manifest once
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            inp = self.io.new_input(manifest_path)
+            with inp.open() as f:
+                data = f.read()
+
+            if not data:
+                raise ValueError("Empty manifest data")
+
+            table = pq.read_table(pa.BufferReader(data))
+            entries = table.to_pylist()
+        except Exception:
+            raise
+
+        # Resolve schema and describe all columns
+        orso_schema = None
+        try:
+            orso_schema = self.schema()
+        except Exception:
+            orso_schema = None
+
+        if orso_schema is None:
+            raise ValueError("Schema unavailable; cannot describe all columns")
+
+        # Map column name -> index for every schema column
+        col_to_idx: dict[str, int] = {c.name: i for i, c in enumerate(orso_schema.columns)}
+
+        # Initialize accumulators per column
+        stats: dict[str, dict] = {}
+        for name in col_to_idx:
+            stats[name] = {
+                "null_count": 0,
+                "mins": [],
+                "maxs": [],
+                "hashes": set(),
+                "file_hist_infos": [],
+                "min_displays": [],
+                "max_displays": [],
+                "uncompressed_bytes": 0,
+            }
+
+        total_rows = 0
+
+        def _decode_minmax(v):
+            if v is None:
+                return None
+            if isinstance(v, (int, float)):
+                return v
+            # For strings stored as string values (not bytes), return as-is
+            if isinstance(v, str):
+                # Try to parse as number for backward compatibility
+                try:
+                    return int(v)
+                except Exception:
+                    try:
+                        return float(v)
+                    except Exception:
+                        # Not a number, return the string itself for display
+                        return v
+            try:
+                if isinstance(v, (bytes, bytearray, memoryview)):
+                    b = bytes(v)
+                    if b and b[-1] == 0xFF:
+                        b = b[:-1]
+                    s = b.decode("utf-8")
+                    try:
+                        return int(s)
+                    except Exception:
+                        try:
+                            return float(s)
+                        except Exception:
+                            # Decoded bytes that aren't numbers, return as string
+                            return s
+            except Exception:
+                pass
+            return None
+
+        # Single pass through entries updating per-column accumulators
+        for ent in entries:
+            if not isinstance(ent, dict):
+                continue
+            total_rows += int(ent.get("record_count") or 0)
+
+            # prefetch lists
+            ncounts = ent.get("null_counts") or []
+            mks = ent.get("min_k_hashes") or []
+            hists = ent.get("histogram_counts") or []
+            mv = ent.get("min_values") or []
+            xv = ent.get("max_values") or []
+            mv_disp = ent.get("min_values_display") or []
+            xv_disp = ent.get("max_values_display") or []
+            col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []
+
+            for cname, cidx in col_to_idx.items():
+                # nulls
+                try:
+                    stats[cname]["null_count"] += int((ncounts or [0])[cidx])
+                except Exception:
+                    pass
+
+                # mins/maxs
+                try:
+                    raw_min = mv[cidx]
+                except Exception:
+                    raw_min = None
+                try:
+                    raw_max = xv[cidx]
+                except Exception:
+                    raw_max = None
+                dmin = _decode_minmax(raw_min)
+                dmax = _decode_minmax(raw_max)
+                if dmin is not None:
+                    stats[cname]["mins"].append(dmin)
+                if dmax is not None:
+                    stats[cname]["maxs"].append(dmax)
+
+                # collect textual display values when present
+                try:
+                    try:
+                        raw_min_disp = mv_disp[cidx]
+                    except Exception:
+                        raw_min_disp = None
+                    try:
+                        raw_max_disp = xv_disp[cidx]
+                    except Exception:
+                        raw_max_disp = None
+
+                    def _decode_display(v):
+                        if v is None:
+                            return None
+                        try:
+                            if isinstance(v, (bytes, bytearray, memoryview)):
+                                b = bytes(v)
+                                if b and b[-1] == 0xFF:
+                                    b = b[:-1]
+                                return b.decode("utf-8", errors="replace")
+                            if isinstance(v, str):
+                                return v
+                        except Exception:
+                            return None
+                        return None
+
+                    md = _decode_display(raw_min_disp)
+                    xd = _decode_display(raw_max_disp)
+                    if md is not None:
+                        stats[cname]["min_displays"].append(md)
+                    if xd is not None:
+                        stats[cname]["max_displays"].append(xd)
+                except Exception:
+                    pass
+
+                # min-k hashes
+                try:
+                    col_mk = mks[cidx] or []
+                except Exception:
+                    col_mk = []
+                for h in col_mk:
+                    try:
+                        stats[cname]["hashes"].add(int(h))
+                    except Exception:
+                        pass
+
+                # histograms
+                try:
+                    col_hist = hists[cidx]
+                except Exception:
+                    col_hist = []
+                if col_hist:
+                    try:
+                        if dmin is not None and dmax is not None and dmin != dmax:
+                            stats[cname]["file_hist_infos"].append(
+                                (float(dmin), float(dmax), list(col_hist))
+                            )
+                    except Exception:
+                        pass
+
+                # uncompressed bytes for this column (sum across files)
+                try:
+                    stats[cname]["uncompressed_bytes"] += int((col_sizes or [0])[cidx])
+                except Exception:
+                    pass
+
+        # Build results per column
+        results: dict[str, dict] = {}
+        for cname, cidx in col_to_idx.items():
+            s = stats[cname]
+            # Handle mixed types: separate strings from numbers
+            mins_filtered = [v for v in s["mins"] if v is not None]
+            maxs_filtered = [v for v in s["maxs"] if v is not None]
+
+            # Group by type: strings vs numbers
+            str_mins = [v for v in mins_filtered if isinstance(v, str)]
+            num_mins = [v for v in mins_filtered if not isinstance(v, str)]
+            str_maxs = [v for v in maxs_filtered if isinstance(v, str)]
+            num_maxs = [v for v in maxs_filtered if not isinstance(v, str)]
+
+            # Use whichever type has values (strings take precedence for text columns)
+            global_min = None
+            global_max = None
+            if str_mins:
+                global_min = min(str_mins)
+            elif num_mins:
+                global_min = min(num_mins)
+
+            if str_maxs:
+                global_max = max(str_maxs)
+            elif num_maxs:
+                global_max = max(num_maxs)
+
+            # kmv approx
+            cardinality = 0
+            cardinality_is_exact = False
+            try:
+                collected = s["hashes"]
+                if collected:
+                    smallest = heapq.nsmallest(32, collected)
+                    k = len(smallest)
+                    if k < 31:
+                        cardinality = len(set(smallest))
+                        cardinality_is_exact = True
+                    else:
+                        MAX_HASH = (1 << 64) - 1
+                        R = max(smallest)
+                        if R == 0:
+                            cardinality = len(set(smallest))
+                        else:
+                            cardinality = int((k - 1) * (MAX_HASH + 1) / (R + 1))
+            except Exception:
+                cardinality = 0
+
+            # distribution via distogram
+            distribution = None
+            if (
+                s["file_hist_infos"]
+                and global_min is not None
+                and global_max is not None
+                and global_max > global_min
+            ):
+                try:
+                    from opteryx_catalog.maki_nage.distogram import Distogram
+                    from opteryx_catalog.maki_nage.distogram import count as _count_dist
+                    from opteryx_catalog.maki_nage.distogram import count_up_to as _count_up_to
+                    from opteryx_catalog.maki_nage.distogram import merge as _merge_distogram
+                    from opteryx_catalog.maki_nage.distogram import update as _update_distogram
+
+                    dist_bin_count = max(50, bins * 5)
+                    global_d = Distogram(bin_count=dist_bin_count)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        temp = Distogram(bin_count=dist_bin_count)
+                        span = float(fmax - fmin) if fmax != fmin else 0.0
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            if span == 0.0:
+                                rep = float(fmin)
+                            else:
+                                rep = fmin + (bi + 0.5) * span / fbins
+                            _update_distogram(temp, float(rep), int(cnt))
+                        global_d = _merge_distogram(global_d, temp)
+
+                    distribution = [0] * bins
+                    total = int(_count_dist(global_d) or 0)
+                    if total == 0:
+                        distribution = [0] * bins
+                    else:
+                        prev = 0.0
+                        gmin = float(global_min)
+                        gmax = float(global_max)
+                        for i in range(1, bins + 1):
+                            edge = gmin + (i / bins) * (gmax - gmin)
+                            cum = _count_up_to(global_d, edge) or 0.0
+                            distribution[i - 1] = int(round(cum - prev))
+                            prev = cum
+                        diff = total - sum(distribution)
+                        if diff != 0:
+                            distribution[-1] += diff
+                except Exception:
+                    distribution = [0] * bins
+                    gspan = float(global_max - global_min)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
+                            gi = int((rep - global_min) / gspan * bins)
+                            if gi < 0:
+                                gi = 0
+                            if gi >= bins:
+                                gi = bins - 1
+                            distribution[gi] += int(cnt)
+
+            res = {
+                "dataset": self.identifier,
+                "description": getattr(self.metadata, "description", None),
+                "row_count": total_rows,
+                "column": cname,
+                "min": global_min,
+                "max": global_max,
+                "null_count": s["null_count"],
+                "uncompressed_bytes": s["uncompressed_bytes"],
+                "cardinality": cardinality,
+                "cardinality_is_exact": cardinality_is_exact,
+                "distribution": distribution,
+            }
+
+            # If textual, attempt display prefixes like describe()
+            try:
+                is_text = False
+                if orso_schema is not None:
+                    col = orso_schema.columns[cidx]
+                    ctype = getattr(col, "type", None)
+                    if ctype is not None:
+                        sctype = str(ctype).lower()
+                        if "char" in sctype or "string" in sctype or "varchar" in sctype:
+                            is_text = True
+            except Exception:
+                is_text = False
+
+            if is_text:
+                # Use only textual display values collected from manifests.
+                # Decode bytes and strip truncation marker (0xFF) if present.
+                def _decode_display_raw(v):
+                    if v is None:
+                        return None
+                    try:
+                        if isinstance(v, (bytes, bytearray, memoryview)):
+                            b = bytes(v)
+                            if b and b[-1] == 0xFF:
+                                b = b[:-1]
+                            s_val = b.decode("utf-8", errors="replace")
+                            return s_val[:16]
+                        if isinstance(v, str):
+                            return v[:16]
+                    except Exception:
+                        return None
+                    return None
+
+                min_disp = None
+                max_disp = None
+                try:
+                    if s.get("min_displays"):
+                        for v in s.get("min_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                min_disp = dv
+                                break
+                    if s.get("max_displays"):
+                        for v in s.get("max_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                max_disp = dv
+                                break
+                except Exception:
+                    min_disp = None
+                    max_disp = None
+
+                if min_disp is not None or max_disp is not None:
+                    res["min_display"] = min_disp
+                    res["max_display"] = max_disp
+
+            results[cname] = res
+
+        return results
+
+    def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
+        """Refresh manifest statistics and create a new snapshot.
+
+        - `agent`: identifier for the agent performing the refresh (string)
+        - `author`: optional author to record; if omitted uses current snapshot author
+
+        This recalculates per-file statistics (min/max, record counts, sizes)
+        for every file in the current manifest, writes a new manifest and
+        creates a new snapshot with `user_created=False` and
+        `operation_type='statistics-refresh'`.
+
+        Returns the new `snapshot_id` on success or None on failure.
+        """
+        prev = self.snapshot(None)
+        if prev is None or not getattr(prev, "manifest_list", None):
+            raise ValueError("No current manifest available to refresh")
+
+        # Use same author/commit-timestamp as previous snapshot unless overridden
+        use_author = author if author is not None else getattr(prev, "author", None)
+
+        snapshot_id = int(time.time() * 1000)
+
+        # Rebuild manifest entries by re-reading each data file
+        entries = []
+        try:
+            # Read previous manifest entries
+            inp = self.io.new_input(prev.manifest_list)
+            with inp.open() as f:
+                prev_data = f.read()
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            # the manifest is a parquet file, read into a pyarrow Table
+            prev_manifest = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_manifest.to_pylist()
+        except Exception:
+            prev_rows = []
+
+        total_files = 0
+        total_size = 0
+        total_data_size = 0
+        total_records = 0
+
+        for ent in prev_rows:
+            if not isinstance(ent, dict):
+                continue
+            fp = ent.get("file_path")
+            if not fp:
+                continue
+            try:
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
+                # Full statistics including histograms and k-hashes
+                file_size = len(data)
+                manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                dent = manifest_entry.to_dict()
+            except Exception:
+                # Fall back to original entry if re-read fails
+                dent = ent
+
+            entries.append(dent)
+            total_files += 1
+            total_size += int(dent.get("file_size_in_bytes") or 0)
+            total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
+            total_records += int(dent.get("record_count") or 0)
+
+        # write new manifest
+        manifest_path = self.catalog.write_parquet_manifest(
+            snapshot_id, entries, self.metadata.location
+        )
+
+        # Build summary
+        summary = {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-data-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-data-size": 0,
+            "deleted-records": 0,
+            "total-data-files": total_files,
+            "total-files-size": total_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        # Agent committer metadata
+        agent_meta = {
+            "timestamp": int(time.time() * 1000),
+            "action": "statistics-refresh",
+            "agent": agent,
+        }
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
+            author=use_author,
+            sequence_number=next_seq,
+            user_created=False,
+            operation_type="statistics-refresh",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=getattr(prev, "commit_message", "statistics refresh"),
+            summary=summary,
+        )
+
+        # attach agent metadata under summary
+        if snap.summary is None:
+            snap.summary = {}
+        snap.summary["agent-committer"] = agent_meta
+
+        # update in-memory metadata
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        # persist
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
+        return snapshot_id
+
     def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
-        """Delete all data files and manifests for this
+        """Delete all data files and manifests for this dataset.
 
         This attempts to delete every data file referenced by existing
         Parquet manifests and then delete the manifest files themselves.
@@ -1109,6 +1436,7 @@ class SimpleDataset(Dataset):
         snaps = list(self.metadata.snapshots)
         removed_files = []
         removed_total_size = 0
+        removed_data_size = 0
 
         for snap in snaps:
             manifest_path = getattr(snap, "manifest_list", None)
@@ -1118,31 +1446,34 @@ class SimpleDataset(Dataset):
            # Read manifest via FileIO if available
            rows = []
            try:
-
-
-
-
-
-                    rows = table.to_pylist()
+                inp = io.new_input(manifest_path)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                rows = table.to_pylist()
            except Exception:
                rows = []
 
            for r in rows:
                fp = None
                fsize = 0
+                data_size = 0
                if isinstance(r, dict):
                    fp = r.get("file_path")
                    fsize = int(r.get("file_size_in_bytes") or 0)
+                    data_size = int(r.get("uncompressed_size_in_bytes") or 0)
                    if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                        fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                        fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
+                        data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
 
                if fp:
                    removed_files.append(fp)
                    removed_total_size += fsize
+                    removed_data_size += data_size
 
        # Create a new empty Parquet manifest (entries=[]) to represent the
-        # truncated
+        # truncated dataset for the new snapshot. Do not delete objects.
        snapshot_id = int(time.time() * 1000)
 
        # Do NOT write an empty Parquet manifest when there are no entries.
@@ -1157,29 +1488,21 @@ class SimpleDataset(Dataset):
        summary = {
            "added-data-files": 0,
            "added-files-size": 0,
+            "added-data-size": 0,
            "added-records": 0,
            "deleted-data-files": deleted_count,
            "deleted-files-size": deleted_size,
+            "deleted-data-size": removed_data_size,
            "deleted-records": 0,
            "total-data-files": 0,
            "total-files-size": 0,
+            "total-data-size": 0,
            "total-records": 0,
        }
 
        # Sequence number
        try:
-
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
        except Exception:
            next_seq = 1
 
@@ -1215,7 +1538,4 @@ class SimpleDataset(Dataset):
        self.metadata.current_snapshot_id = snapshot_id
 
        if self.catalog and hasattr(self.catalog, "save_snapshot"):
-
-                self.catalog.save_snapshot(self.identifier, snap)
-            except Exception:
-                pass
+            self.catalog.save_snapshot(self.identifier, snap)