opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,9 @@ from typing import Any
 from typing import Iterable
 from typing import Optional
 
+from .manifest import ParquetManifestEntry
+from .manifest import build_parquet_manifest_entry
+from .manifest import build_parquet_manifest_minmax_entry
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -69,6 +72,26 @@ class SimpleDataset(Dataset):
     def metadata(self) -> DatasetMetadata:
         return self._metadata
 
+    def _next_sequence_number(self) -> int:
+        """Calculate the next sequence number.
+
+        Uses the current snapshot's sequence number + 1. Works efficiently
+        with load_history=False since we only need the most recent snapshot,
+        not the full history.
+
+        Returns:
+            The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
+        """
+        if not self.metadata.snapshots:
+            # No snapshots yet - this is the first one
+            return 1
+
+        # Get the current (most recent) snapshot - should have the highest sequence number
+        current = self.snapshot()
+        if current:
+            seq = getattr(current, "sequence_number", None)
+            return int(seq) + 1 if seq is not None else 1
+
     def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
         """Return a Snapshot.
 
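The new `_next_sequence_number` helper derives the next sequence from the current snapshot alone instead of scanning the full snapshot history. A minimal sketch of the intended rule (the standalone function and values below are invented for illustration):

    def next_sequence_number(current_sequence_number):
        # First snapshot gets sequence 1; afterwards it is the current snapshot's sequence + 1.
        if current_sequence_number is None:
            return 1
        return int(current_sequence_number) + 1

    assert next_sequence_number(None) == 1
    assert next_sequence_number(7) == 8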
@@ -95,20 +118,17 @@ class SimpleDataset(Dataset):
                 if doc.exists:
                     sd = doc.to_dict() or {}
                     snap = Snapshot(
-                        snapshot_id=int(
-                            sd.get("snapshot-id") or sd.get("snapshot_id") or snapshot_id
-                        ),
-                        timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
+                        snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
+                        timestamp_ms=int(sd.get("timestamp-ms", 0)),
                         author=sd.get("author"),
-                        sequence_number=sd.get("sequence-number") or sd.get("sequence_number"),
-                        user_created=sd.get("user-created") or sd.get("user_created"),
-                        manifest_list=sd.get("manifest") or sd.get("manifest_list"),
-                        schema_id=sd.get("schema-id") or sd.get("schema_id"),
+                        sequence_number=sd.get("sequence-number", 0),
+                        user_created=sd.get("user-created"),
+                        manifest_list=sd.get("manifest"),
+                        schema_id=sd.get("schema-id"),
                         summary=sd.get("summary", {}),
-                        operation_type=sd.get("operation-type") or sd.get("operation_type"),
-                        parent_snapshot_id=sd.get("parent-snapshot-id")
-                        or sd.get("parent_snapshot_id"),
-                        commit_message=sd.get("commit-message") or sd.get("commit_message"),
+                        operation_type=sd.get("operation-type"),
+                        parent_snapshot_id=sd.get("parent-snapshot-id"),
+                        commit_message=sd.get("commit-message"),
                     )
                     return snap
             except Exception:
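With this change the snapshot document is read using only the hyphenated keys; the snake_case fallbacks are gone. A hedged sketch of the document shape the reader now expects (all values below are invented examples):

    sd = {
        "snapshot-id": 1733750400000,
        "timestamp-ms": 1733750400000,
        "author": "example-user",
        "sequence-number": 3,
        "manifest": "gs://example-bucket/example-dataset/metadata/manifest.parquet",
        "schema-id": 1,
        "operation-type": "append",
        "summary": {},
    }
    snapshot_id = int(sd.get("snapshot-id") or 0)
    sequence_number = sd.get("sequence-number", 0)
    manifest_list = sd.get("manifest")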
@@ -227,148 +247,9 @@ class SimpleDataset(Dataset):
         if not hasattr(table, "schema"):
             raise TypeError("append() expects a pyarrow.Table-like object")
 
-        # Write parquet file with collision-resistant name
-        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
-        data_path = f"{self.metadata.location}/data/{fname}"
-        buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, compression="zstd")
-        pdata = buf.getvalue().to_pybytes()
-
-        out = self.io.new_output(data_path).create()
-        out.write(pdata)
-        out.close()
-
-        # Prepare sketches/stats
-        K = 32
-        HBINS = 32
-        min_k_hashes: list[list[int]] = []
-        histograms: list[list[int]] = []
-        min_values: list[int] = []
-        max_values: list[int] = []
-
-        # Use draken for efficient hashing and compression when available.
-        import heapq
-
-        # canonical NULL flag for missing values
-        NULL_FLAG = -(1 << 63)
-
-        try:
-            import opteryx.draken as draken  # type: ignore
-
-            num_rows = int(table.num_rows)
-
-            for col_idx, col in enumerate(table.columns):
-                # hash column values to 64-bit via draken (new cpdef API)
-                vec = draken.Vector.from_arrow(col)
-                hashes = list(vec.hash())
-
-                # Decide whether to compute min-k/histogram for this column based
-                # on field type and, for strings, average length of values.
-                field_type = table.schema.field(col_idx).type
-                compute_min_k = False
-                if (
-                    pa.types.is_integer(field_type)
-                    or pa.types.is_floating(field_type)
-                    or pa.types.is_decimal(field_type)
-                ):
-                    compute_min_k = True
-                elif (
-                    pa.types.is_timestamp(field_type)
-                    or pa.types.is_date(field_type)
-                    or pa.types.is_time(field_type)
-                ):
-                    compute_min_k = True
-                elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                    # compute average length from non-null values; only allow
-                    # min-k/histogram for short strings (avg <= 16)
-                    col_py = None
-                    try:
-                        col_py = col.to_pylist()
-                    except Exception:
-                        col_py = None
-
-                    if col_py is not None:
-                        lens = [len(x) for x in col_py if x is not None]
-                        if lens:
-                            avg_len = sum(lens) / len(lens)
-                            if avg_len <= 16:
-                                compute_min_k = True
-
-                # KMV: take K smallest hashes when allowed; otherwise store an
-                # empty list for this column.
-                if compute_min_k:
-                    smallest = heapq.nsmallest(K, hashes)
-                    col_min_k = sorted(smallest)
-                else:
-                    col_min_k = []
-
-                # For histogram decisions follow the same rule as min-k
-                compute_hist = compute_min_k
-
-                # Use draken.compress() to get canonical int64 per value
-                mapped = list(vec.compress())
-                non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-                if non_nulls_mapped:
-                    vmin = min(non_nulls_mapped)
-                    vmax = max(non_nulls_mapped)
-                    col_min = int(vmin)
-                    col_max = int(vmax)
-                    if compute_hist:
-                        if vmin == vmax:
-                            col_hist = [0] * HBINS
-                            col_hist[-1] = len(non_nulls_mapped)
-                        else:
-                            col_hist = [0] * HBINS
-                            span = float(vmax - vmin)
-                            for m in non_nulls_mapped:
-                                b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
-                                if b < 0:
-                                    b = 0
-                                if b >= HBINS:
-                                    b = HBINS - 1
-                                col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-                else:
-                    # no non-null values; histogram via hash buckets
-                    col_min = NULL_FLAG
-                    col_max = NULL_FLAG
-                    if compute_hist:
-                        col_hist = [0] * HBINS
-                        for h in hashes:
-                            b = (h >> (64 - 5)) & 0x1F
-                            col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-
-                min_k_hashes.append(col_min_k)
-                histograms.append(col_hist)
-                min_values.append(col_min)
-                max_values.append(col_max)
-        except Exception:
-            # If draken or its dependencies are unavailable, fall back to
-            # conservative defaults so we can still write the manifest and
-            # snapshot without failing the append operation.
-            num_cols = table.num_columns
-            min_k_hashes = [[] for _ in range(num_cols)]
-            HBINS = 32
-            histograms = [[0] * HBINS for _ in range(num_cols)]
-            min_values = [NULL_FLAG] * num_cols
-            max_values = [NULL_FLAG] * num_cols
-
-        entries = [
-            {
-                "file_path": data_path,
-                "file_format": "parquet",
-                "record_count": int(table.num_rows),
-                "file_size_in_bytes": len(pdata),
-                "min_k_hashes": min_k_hashes,
-                "histogram_counts": histograms,
-                "histogram_bins": HBINS,
-                "min_values": min_values,
-                "max_values": max_values,
-            }
-        ]
+        # Write table and build manifest entry
+        manifest_entry = self._write_table_and_build_entry(table)
+        entries = [manifest_entry.to_dict()]
 
         # persist manifest: for append, merge previous manifest entries
         # with the new entries so the snapshot's manifest is cumulative.
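The appended data file is now summarised by a ParquetManifestEntry and serialised with to_dict() before it goes into the manifest. A hedged sketch of such a serialised entry, using the field names visible in the ParquetManifestEntry constructor calls elsewhere in this diff (values and path are invented):

    entry = {
        "file_path": "gs://example-bucket/example-dataset/data/0001-node.parquet",
        "file_format": "parquet",
        "record_count": 1000,
        "null_counts": [0, 12],
        "file_size_in_bytes": 20480,
        "uncompressed_size_in_bytes": 65536,
        "column_uncompressed_sizes_in_bytes": [32768, 32768],
        "min_k_hashes": [[], []],       # up to K smallest 64-bit hashes per column
        "histogram_counts": [[], []],   # per-column histogram bin counts
        "histogram_bins": 32,
        "min_values": [1, -5],
        "max_values": [999, 120],
    }
    assert entry["file_format"] == "parquet"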
@@ -384,35 +265,15 @@ class SimpleDataset(Dataset):
             prev_manifest_path = prev_snap.manifest_list
             try:
                 # Prefer FileIO when available
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(prev_manifest_path)
-                    with inp.open() as f:
-                        prev_data = f.read()
-                    import pyarrow as pa
-                    import pyarrow.parquet as pq
-
-                    prev_table = pq.read_table(pa.BufferReader(prev_data))
-                    prev_rows = prev_table.to_pylist()
-                    merged_entries = prev_rows + merged_entries
-                else:
-                    # Fall back to catalog storage client (GCS)
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev_manifest_path
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        prev_data = blob.download_as_bytes()
-                        import pyarrow as pa
-                        import pyarrow.parquet as pq
-
-                        prev_table = pq.read_table(pa.BufferReader(prev_data))
-                        prev_rows = prev_table.to_pylist()
-                        merged_entries = prev_rows + merged_entries
+                inp = self.io.new_input(prev_manifest_path)
+                with inp.open() as f:
+                    prev_data = f.read()
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
+                prev_table = pq.read_table(pa.BufferReader(prev_data))
+                prev_rows = prev_table.to_pylist()
+                merged_entries = prev_rows + merged_entries
             except Exception:
                 # If we can't read the previous manifest, continue with
                 # just the new entries (don't fail the append).
@@ -433,63 +294,52 @@ class SimpleDataset(Dataset):
             commit_message = f"commit by {author}"
 
         recs = int(table.num_rows)
-        fsize = len(pdata)
+        fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
+        # Calculate uncompressed size from the manifest entry
+        added_data_size = manifest_entry.uncompressed_size_in_bytes
         added_data_files = 1
         added_files_size = fsize
         added_records = recs
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
         prev = self.snapshot()
         if prev and prev.summary:
-            try:
-                prev_total_files = int(prev.summary.get("total-data-files", 0))
-            except Exception:
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         else:
             prev_total_files = 0
            prev_total_size = 0
+            prev_total_data_size = 0
             prev_total_records = 0
 
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
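In append(), each running total is carried forward from the previous snapshot: the new total is the previous total plus what the commit adds, minus what it deletes, now including the uncompressed "data-size" counters introduced here. A small worked example of that bookkeeping (all numbers invented):

    prev = {"total-data-files": 4, "total-files-size": 4096, "total-data-size": 16384, "total-records": 1000}
    added = {"files": 1, "files-size": 512, "data-size": 2048, "records": 250}
    deleted = {"files": 0, "files-size": 0, "data-size": 0, "records": 0}

    summary = {
        "total-data-files": prev["total-data-files"] + added["files"] - deleted["files"],
        "total-files-size": prev["total-files-size"] + added["files-size"] - deleted["files-size"],
        "total-data-size": prev["total-data-size"] + added["data-size"] - deleted["data-size"],
        "total-records": prev["total-records"] + added["records"] - deleted["records"],
    }
    assert summary == {
        "total-data-files": 5,
        "total-files-size": 4608,
        "total-data-size": 18432,
        "total-records": 1250,
    }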
@@ -518,6 +368,136 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
+    def _write_table_and_build_entry(self, table: Any):
+        """Write a PyArrow table to storage and return a ParquetManifestEntry.
+
+        This centralizes the IO and manifest construction so other operations
+        (e.g. `overwrite`) can reuse the same behavior as `append`.
+        """
+        # Write parquet file with collision-resistant name
+        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
+        data_path = f"{self.metadata.location}/data/{fname}"
+
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, compression="zstd")
+        pdata = buf.getvalue().to_pybytes()
+
+        out = self.io.new_output(data_path).create()
+        out.write(pdata)
+        out.close()
+
+        # Build manifest entry with statistics
+        manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
+        return manifest_entry
+
+    def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
+        """Replace the dataset entirely with `table` in a single snapshot.
+
+        Semantics:
+        - Write the provided table as new data file(s)
+        - Create a new parquet manifest that contains only the new entries
+        - Create a snapshot that records previous files as deleted and the
+          new files as added (logical replace)
+        """
+        # Similar validation as append
+        snapshot_id = int(time.time() * 1000)
+
+        if not hasattr(table, "schema"):
+            raise TypeError("overwrite() expects a pyarrow.Table-like object")
+
+        if author is None:
+            raise ValueError("author must be provided when overwriting a dataset")
+
+        # Write new data and build manifest entries (single table -> single entry)
+        manifest_entry = self._write_table_and_build_entry(table)
+        new_entries = [manifest_entry.to_dict()]
+
+        # Write manifest containing only the new entries
+        manifest_path = None
+        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
+            manifest_path = self.catalog.write_parquet_manifest(
+                snapshot_id, new_entries, self.metadata.location
+            )
+
+        # Compute deltas: previous manifest becomes deleted
+        prev = self.snapshot(None)
+        prev_total_files = 0
+        prev_total_size = 0
+        prev_total_data_size = 0
+        prev_total_records = 0
+        if prev and prev.summary:
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
+
+        deleted_data_files = prev_total_files
+        deleted_files_size = prev_total_size
+        deleted_data_size = prev_total_data_size
+        deleted_records = prev_total_records
+
+        added_data_files = len(new_entries)
+        added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
+        added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
+        added_records = sum(e.get("record_count", 0) for e in new_entries)
+
+        total_data_files = added_data_files
+        total_files_size = added_files_size
+        total_data_size = added_data_size
+        total_records = added_records
+
+        summary = {
+            "added-data-files": added_data_files,
+            "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
+            "added-records": added_records,
+            "deleted-data-files": deleted_data_files,
+            "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
+            "deleted-records": deleted_records,
+            "total-data-files": total_data_files,
+            "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        if commit_message is None:
+            commit_message = f"overwrite by {author}"
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=snapshot_id,
+            author=author,
+            sequence_number=next_seq,
+            user_created=True,
+            operation_type="overwrite",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=commit_message,
+            summary=summary,
+        )
+
+        # Replace in-memory snapshots
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
     def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
         """Add filenames to the dataset manifest without writing the files.
 
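For context, a hedged usage sketch of the new `overwrite()` entry point (the dataset/catalog setup is omitted, so the actual call is shown as a comment):

    import pyarrow as pa

    # Build a replacement table; overwrite() expects a pyarrow.Table-like object
    # and a non-None author, per the validation added in this diff.
    table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # dataset.overwrite(table, author="example-user", commit_message="rebuild dataset")
    # -> writes one new data file, writes a manifest containing only that file, and
    #    commits a snapshot whose summary marks all previous files as deleted.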
@@ -540,45 +520,20 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
         prev_entries = []
         if prev and prev.summary:
-            try:
-                prev_total_files = int(prev.summary.get("total-data-files", 0))
-            except Exception:
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
-
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         if prev and getattr(prev, "manifest_list", None):
             # try to read prev manifest entries
             try:
                 import pyarrow as pa
                 import pyarrow.parquet as pq
 
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(prev.manifest_list)
-                    with inp.open() as f:
-                        data = f.read()
-                    table = pq.read_table(pa.BufferReader(data))
-                    prev_entries = table.to_pylist()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev.manifest_list
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
-                        table = pq.read_table(pa.BufferReader(data))
-                        prev_entries = table.to_pylist()
+                inp = self.io.new_input(prev.manifest_list)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                prev_entries = table.to_pylist()
             except Exception:
                 prev_entries = []
 
@@ -601,146 +556,47 @@ class SimpleDataset(Dataset):
             seen.add(fp)
 
             # Attempt to read file bytes and parquet metadata
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
+            # Use rugo's metadata reader which is much faster (microseconds per file)
             try:
-                import pyarrow as pa
-                import pyarrow.parquet as pq
-
-                data = None
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(fp)
-                    with inp.open() as f:
-                        data = f.read()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = fp
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
 
                 if data:
-                    file_size = len(data)
-                    pf = pq.ParquetFile(pa.BufferReader(data))
-                    record_count = int(pf.metadata.num_rows or 0)
-
-                    # Prefer computing min/max via draken.compress() over
-                    # relying on Parquet footer stats which may contain
-                    # heterogenous or non-numeric values. Fall back to
-                    # footer stats only if draken is unavailable.
-                    try:
-                        import opteryx.draken as draken  # type: ignore
-
-                        table = pq.read_table(pa.BufferReader(data))
-                        ncols = table.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-
-                        NULL_FLAG = -(1 << 63)
-
-                        for ci in range(ncols):
-                            try:
-                                col = table.column(ci)
-                                # combine chunks if needed
-                                if hasattr(col, "combine_chunks"):
-                                    arr = col.combine_chunks()
-                                else:
-                                    arr = col
-                                vec = draken.Vector.from_arrow(arr)
-                                mapped = list(vec.compress())
-                                non_nulls = [m for m in mapped if m != NULL_FLAG]
-                                if non_nulls:
-                                    mins[ci] = int(min(non_nulls))
-                                    maxs[ci] = int(max(non_nulls))
-                                else:
-                                    mins[ci] = None
-                                    maxs[ci] = None
-                            except Exception:
-                                # per-column fallback: leave None
-                                mins[ci] = None
-                                maxs[ci] = None
-                    except Exception:
-                        # Draken not available; fall back to Parquet footer stats
-                        ncols = pf.metadata.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-                        for rg in range(pf.num_row_groups):
-                            for ci in range(ncols):
-                                col_meta = pf.metadata.row_group(rg).column(ci)
-                                stats = getattr(col_meta, "statistics", None)
-                                if not stats:
-                                    continue
-                                smin = getattr(stats, "min", None)
-                                smax = getattr(stats, "max", None)
-                                if smin is None and smax is None:
-                                    continue
-
-                                def _to_py(v):
-                                    try:
-                                        return int(v)
-                                    except Exception:
-                                        try:
-                                            return float(v)
-                                        except Exception:
-                                            try:
-                                                if isinstance(v, (bytes, bytearray)):
-                                                    return v.decode("utf-8", errors="ignore")
-                                            except Exception:
-                                                pass
-                                            return v
-
-                                if smin is not None:
-                                    sval = _to_py(smin)
-                                    if mins[ci] is None:
-                                        mins[ci] = sval
-                                    else:
-                                        try:
-                                            if sval < mins[ci]:
-                                                mins[ci] = sval
-                                        except Exception:
-                                            pass
-                                if smax is not None:
-                                    sval = _to_py(smax)
-                                    if maxs[ci] is None:
-                                        maxs[ci] = sval
-                                    else:
-                                        try:
-                                            if sval > maxs[ci]:
-                                                maxs[ci] = sval
-                                        except Exception:
-                                            pass
-
-                    # normalize lists to empty lists when values missing
-                    min_values = [m for m in mins if m is not None]
-                    max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
                 # If metadata read fails, fall back to placeholders
-                file_size = 0
-                record_count = 0
-                min_values = []
-                max_values = []
-
-            new_entries.append(
-                {
-                    "file_path": fp,
-                    "file_format": "parquet",
-                    "record_count": int(record_count),
-                    "file_size_in_bytes": int(file_size),
-                    "min_k_hashes": [],
-                    "histogram_counts": [],
-                    "histogram_bins": 0,
-                    "min_values": min_values,
-                    "max_values": max_values,
-                }
-            )
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
 
         merged_entries = prev_entries + new_entries
 
@@ -754,41 +610,43 @@ class SimpleDataset(Dataset):
         # Build summary deltas
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum uncompressed sizes from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
 
+        prev_total_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
+
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -897,6 +755,7 @@ class SimpleDataset(Dataset):
                     ncols = pf.metadata.num_columns
                     mins = [None] * ncols
                     maxs = [None] * ncols
+                    null_counts = [0] * ncols
                     for rg in range(pf.num_row_groups):
                         for ci in range(ncols):
                             col_meta = pf.metadata.row_group(rg).column(ci)
@@ -905,7 +764,8 @@ class SimpleDataset(Dataset):
                                 continue
                             smin = getattr(stats, "min", None)
                             smax = getattr(stats, "max", None)
-                            if smin is None and smax is None:
+                            snull_count = getattr(stats, "null_count", None)
+                            if smin is None and smax is None and snull_count is None:
                                 continue
 
                             def _to_py(v):
@@ -942,6 +802,11 @@ class SimpleDataset(Dataset):
                                             maxs[ci] = sval
                                     except Exception:
                                         pass
+                            if snull_count is not None:
+                                try:
+                                    null_counts[ci] += int(snull_count)
+                                except Exception:
+                                    pass
 
                     min_values = [m for m in mins if m is not None]
                     max_values = [m for m in maxs if m is not None]
@@ -950,20 +815,23 @@ class SimpleDataset(Dataset):
                 record_count = 0
                 min_values = []
                 max_values = []
-
-            new_entries.append(
-                {
-                    "file_path": fp,
-                    "file_format": "parquet",
-                    "record_count": int(record_count),
-                    "file_size_in_bytes": int(file_size),
-                    "min_k_hashes": [],
-                    "histogram_counts": [],
-                    "histogram_bins": 0,
-                    "min_values": min_values,
-                    "max_values": max_values,
-                }
+                null_counts = []
+
+            manifest_entry = ParquetManifestEntry(
+                file_path=fp,
+                file_format="parquet",
+                record_count=int(record_count),
+                null_counts=null_counts,
+                file_size_in_bytes=int(file_size),
+                uncompressed_size_in_bytes=int(file_size),  # Use compressed size as estimate
+                column_uncompressed_sizes_in_bytes=[],
+                min_k_hashes=[],
+                histogram_counts=[],
+                histogram_bins=0,
+                min_values=min_values,
+                max_values=max_values,
             )
+            new_entries.append(manifest_entry.to_dict())
 
         manifest_path = None
         if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
@@ -974,42 +842,42 @@ class SimpleDataset(Dataset):
         # Build summary: previous entries become deleted
         deleted_data_files = prev_total_files
         deleted_files_size = prev_total_size
+        deleted_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
         deleted_records = prev_total_records
 
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
+        # Sum uncompressed sizes from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
         added_records = 0
 
         total_data_files = added_data_files
         total_files_size = added_files_size
+        total_data_size = added_data_size
         total_records = added_records
 
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
 
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -1042,13 +910,11 @@ class SimpleDataset(Dataset):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
     def scan(
-        self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
+        self, row_filter=None, snapshot_id: Optional[int] = None
     ) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
 
        - If `snapshot_id` is None, use the current snapshot.
-        - Ignore `row_filter` for now and return all files listed in the
-          snapshot's parquet manifest (if present).
         """
         # Determine snapshot to read using the dataset-level helper which
         # prefers the in-memory current snapshot and otherwise performs a
@@ -1065,8 +931,6 @@ class SimpleDataset(Dataset):
             import pyarrow as pa
             import pyarrow.parquet as pq
 
-            data = None
-
             inp = self.io.new_input(manifest_path)
             with inp.open() as f:
                 data = f.read()
@@ -1076,23 +940,148 @@ class SimpleDataset(Dataset):
 
             table = pq.read_table(pa.BufferReader(data))
             rows = table.to_pylist()
-            cum_rows = 0
             for r in rows:
                 yield Datafile(entry=r)
-                try:
-                    rc = int(r.get("record_count") or 0)
-                except Exception:
-                    rc = 0
-                cum_rows += rc
-                if row_limit is not None and cum_rows >= row_limit:
-                    break
         except FileNotFoundError:
             return iter(())
         except Exception:
             return iter(())
 
+    def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
+        """Refresh manifest statistics and create a new snapshot.
+
+        - `agent`: identifier for the agent performing the refresh (string)
+        - `author`: optional author to record; if omitted uses current snapshot author
+
+        This recalculates per-file statistics (min/max, record counts, sizes)
+        for every file in the current manifest, writes a new manifest and
+        creates a new snapshot with `user_created=False` and
+        `operation_type='statistics-refresh'`.
+
+        Returns the new `snapshot_id` on success or None on failure.
+        """
+        prev = self.snapshot(None)
+        if prev is None or not getattr(prev, "manifest_list", None):
+            raise ValueError("No current manifest available to refresh")
+
+        # Use same author/commit-timestamp as previous snapshot unless overridden
+        use_author = author if author is not None else getattr(prev, "author", None)
+
+        snapshot_id = int(time.time() * 1000)
+
+        # Rebuild manifest entries by re-reading each data file
+        entries = []
+        try:
+            # Read previous manifest entries
+            inp = self.io.new_input(prev.manifest_list)
+            with inp.open() as f:
+                prev_data = f.read()
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            prev_table = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_table.to_pylist()
+        except Exception:
+            prev_rows = []
+
+        total_files = 0
+        total_size = 0
+        total_data_size = 0
+        total_records = 0
+
+        for ent in prev_rows:
+            if not isinstance(ent, dict):
+                continue
+            fp = ent.get("file_path")
+            if not fp:
+                continue
+            try:
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
+                # Full statistics including histograms and k-hashes
+                table = pq.read_table(pa.BufferReader(data))
+                manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
+                dent = manifest_entry.to_dict()
+            except Exception:
+                # Fall back to original entry if re-read fails
+                dent = ent
+
+            entries.append(dent)
+            total_files += 1
+            total_size += int(dent.get("file_size_in_bytes") or 0)
+            total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
+            total_records += int(dent.get("record_count") or 0)
+
+        # write new manifest
+        manifest_path = self.catalog.write_parquet_manifest(
+            snapshot_id, entries, self.metadata.location
+        )
+
+        # Build summary
+        summary = {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-data-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-data-size": 0,
+            "deleted-records": 0,
+            "total-data-files": total_files,
+            "total-files-size": total_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+
+        parent_id = self.metadata.current_snapshot_id
+
+        # Agent committer metadata
+        agent_meta = {
+            "timestamp": int(time.time() * 1000),
+            "action": "statistics-refresh",
+            "agent": agent,
+        }
+
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
+            author=use_author,
+            sequence_number=next_seq,
+            user_created=False,
+            operation_type="statistics-refresh",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=getattr(prev, "commit_message", "statistics refresh"),
+            summary=summary,
+        )
+
+        # attach agent metadata under summary
+        if snap.summary is None:
+            snap.summary = {}
+        snap.summary["agent-committer"] = agent_meta
+
+        # update in-memory metadata
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+
+        # persist
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+
+        return snapshot_id
+
     def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
-        """Delete all data files and manifests for this table.
+        """Delete all data files and manifests for this dataset.
 
         This attempts to delete every data file referenced by existing
         Parquet manifests and then delete the manifest files themselves.
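A hedged sketch of how the new `refresh_manifest()` maintenance entry point is expected to be used, and the agent-committer record it attaches under the snapshot summary (the agent name and dataset setup are invented for the example):

    import time

    # Shape of the record refresh_manifest() stores under snapshot.summary["agent-committer"]:
    agent_meta = {
        "timestamp": int(time.time() * 1000),
        "action": "statistics-refresh",
        "agent": "example-stats-agent",  # invented agent identifier
    }

    # With a configured SimpleDataset, the call would look like:
    # new_snapshot_id = dataset.refresh_manifest(agent="example-stats-agent")
    # The committed snapshot has user_created=False and operation_type="statistics-refresh".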
@@ -1109,6 +1098,7 @@ class SimpleDataset(Dataset):
         snaps = list(self.metadata.snapshots)
         removed_files = []
         removed_total_size = 0
+        removed_data_size = 0
 
         for snap in snaps:
             manifest_path = getattr(snap, "manifest_list", None)
@@ -1118,31 +1108,34 @@ class SimpleDataset(Dataset):
             # Read manifest via FileIO if available
             rows = []
             try:
-                if hasattr(io, "new_input"):
-                    inp = io.new_input(manifest_path)
-                    with inp.open() as f:
-                        data = f.read()
-                    table = pq.read_table(pa.BufferReader(data))
-                    rows = table.to_pylist()
+                inp = io.new_input(manifest_path)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                rows = table.to_pylist()
             except Exception:
                 rows = []
 
             for r in rows:
                 fp = None
                 fsize = 0
+                data_size = 0
                 if isinstance(r, dict):
                     fp = r.get("file_path")
                     fsize = int(r.get("file_size_in_bytes") or 0)
+                    data_size = int(r.get("uncompressed_size_in_bytes") or 0)
                     if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                         fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                         fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
+                        data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
 
                 if fp:
                     removed_files.append(fp)
                     removed_total_size += fsize
+                    removed_data_size += data_size
 
         # Create a new empty Parquet manifest (entries=[]) to represent the
-        # truncated table for the new snapshot. Do not delete objects.
+        # truncated dataset for the new snapshot. Do not delete objects.
         snapshot_id = int(time.time() * 1000)
 
         # Do NOT write an empty Parquet manifest when there are no entries.
@@ -1157,29 +1150,21 @@ class SimpleDataset(Dataset):
         summary = {
             "added-data-files": 0,
             "added-files-size": 0,
+            "added-data-size": 0,
             "added-records": 0,
             "deleted-data-files": deleted_count,
             "deleted-files-size": deleted_size,
+            "deleted-data-size": removed_data_size,
             "deleted-records": 0,
             "total-data-files": 0,
             "total-files-size": 0,
+            "total-data-size": 0,
             "total-records": 0,
         }
 
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
 
@@ -1215,7 +1200,4 @@ class SimpleDataset(Dataset):
         self.metadata.current_snapshot_id = snapshot_id
 
         if self.catalog and hasattr(self.catalog, "save_snapshot"):
-            try:
-                self.catalog.save_snapshot(self.identifier, snap)
-            except Exception:
-                pass
+            self.catalog.save_snapshot(self.identifier, snap)