opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. opteryx_catalog/__init__.py +1 -1
  2. opteryx_catalog/catalog/__init__.py +2 -1
  3. opteryx_catalog/catalog/compaction.py +536 -0
  4. opteryx_catalog/catalog/dataset.py +840 -520
  5. opteryx_catalog/catalog/manifest.py +475 -0
  6. opteryx_catalog/catalog/metadata.py +5 -2
  7. opteryx_catalog/catalog/metastore.py +2 -2
  8. opteryx_catalog/exceptions.py +1 -1
  9. opteryx_catalog/iops/fileio.py +13 -0
  10. opteryx_catalog/iops/gcs.py +35 -5
  11. opteryx_catalog/maki_nage/__init__.py +8 -0
  12. opteryx_catalog/maki_nage/distogram.py +558 -0
  13. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  14. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  15. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  16. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  17. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  18. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  19. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  20. opteryx_catalog/opteryx_catalog.py +296 -242
  21. opteryx_catalog/webhooks/__init__.py +230 -0
  22. opteryx_catalog/webhooks/events.py +177 -0
  23. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  24. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  25. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  26. scripts/collect_byte_counts.py +42 -0
  27. scripts/create_dataset.py +1 -1
  28. scripts/emit_full_single_file.py +81 -0
  29. scripts/inspect_manifest_dryrun.py +322 -0
  30. scripts/inspect_single_file.py +147 -0
  31. scripts/inspect_single_file_gcs.py +124 -0
  32. scripts/read_dataset.py +1 -1
  33. tests/test_collections.py +37 -0
  34. tests/test_compaction.py +233 -0
  35. tests/test_dataset_metadata.py +14 -0
  36. tests/test_describe_uncompressed.py +127 -0
  37. tests/test_refresh_manifest.py +275 -0
  38. tests/test_webhooks.py +177 -0
  39. opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
  40. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  41. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,8 @@ from typing import Any
8
8
  from typing import Iterable
9
9
  from typing import Optional
10
10
 
11
+ from .manifest import ParquetManifestEntry
12
+ from .manifest import build_parquet_manifest_entry_from_bytes
11
13
  from .metadata import DatasetMetadata
12
14
  from .metadata import Snapshot
13
15
  from .metastore import Dataset
@@ -69,6 +71,26 @@ class SimpleDataset(Dataset):
69
71
  def metadata(self) -> DatasetMetadata:
70
72
  return self._metadata
71
73
 
74
+ def _next_sequence_number(self) -> int:
75
+ """Calculate the next sequence number.
76
+
77
+ Uses the current snapshot's sequence number + 1. Works efficiently
78
+ with load_history=False since we only need the most recent snapshot,
79
+ not the full history.
80
+
81
+ Returns:
82
+ The next sequence number (current snapshot's sequence + 1, or 1 if no snapshots).
83
+ """
84
+ if not self.metadata.snapshots:
85
+ # No snapshots yet - this is the first one
86
+ return 1
87
+
88
+ # Get the current (most recent) snapshot - should have the highest sequence number
89
+ current = self.snapshot()
90
+ if current:
91
+ seq = getattr(current, "sequence_number", None)
92
+ return int(seq) + 1 if seq is not None else 1
93
+
72
94
  def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
73
95
  """Return a Snapshot.
74
96
 
@@ -95,20 +117,17 @@ class SimpleDataset(Dataset):
95
117
  if doc.exists:
96
118
  sd = doc.to_dict() or {}
97
119
  snap = Snapshot(
98
- snapshot_id=int(
99
- sd.get("snapshot-id") or sd.get("snapshot_id") or snapshot_id
100
- ),
101
- timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
120
+ snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
121
+ timestamp_ms=int(sd.get("timestamp-ms", 0)),
102
122
  author=sd.get("author"),
103
- sequence_number=sd.get("sequence-number") or sd.get("sequence_number"),
104
- user_created=sd.get("user-created") or sd.get("user_created"),
105
- manifest_list=sd.get("manifest") or sd.get("manifest_list"),
106
- schema_id=sd.get("schema-id") or sd.get("schema_id"),
123
+ sequence_number=sd.get("sequence-number", 0),
124
+ user_created=sd.get("user-created"),
125
+ manifest_list=sd.get("manifest"),
126
+ schema_id=sd.get("schema-id"),
107
127
  summary=sd.get("summary", {}),
108
- operation_type=sd.get("operation-type") or sd.get("operation_type"),
109
- parent_snapshot_id=sd.get("parent-snapshot-id")
110
- or sd.get("parent_snapshot_id"),
111
- commit_message=sd.get("commit-message") or sd.get("commit_message"),
128
+ operation_type=sd.get("operation-type"),
129
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
130
+ commit_message=sd.get("commit-message"),
112
131
  )
113
132
  return snap
114
133
  except Exception:
@@ -227,148 +246,9 @@ class SimpleDataset(Dataset):
227
246
  if not hasattr(table, "schema"):
228
247
  raise TypeError("append() expects a pyarrow.Table-like object")
229
248
 
230
- # Write parquet file with collision-resistant name
231
- fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
232
- data_path = f"{self.metadata.location}/data/{fname}"
233
- buf = pa.BufferOutputStream()
234
- pq.write_table(table, buf, compression="zstd")
235
- pdata = buf.getvalue().to_pybytes()
236
-
237
- out = self.io.new_output(data_path).create()
238
- out.write(pdata)
239
- out.close()
240
-
241
- # Prepare sketches/stats
242
- K = 32
243
- HBINS = 32
244
- min_k_hashes: list[list[int]] = []
245
- histograms: list[list[int]] = []
246
- min_values: list[int] = []
247
- max_values: list[int] = []
248
-
249
- # Use draken for efficient hashing and compression when available.
250
- import heapq
251
-
252
- # canonical NULL flag for missing values
253
- NULL_FLAG = -(1 << 63)
254
-
255
- try:
256
- import opteryx.draken as draken # type: ignore
257
-
258
- num_rows = int(table.num_rows)
259
-
260
- for col_idx, col in enumerate(table.columns):
261
- # hash column values to 64-bit via draken (new cpdef API)
262
- vec = draken.Vector.from_arrow(col)
263
- hashes = list(vec.hash())
264
-
265
- # Decide whether to compute min-k/histogram for this column based
266
- # on field type and, for strings, average length of values.
267
- field_type = table.schema.field(col_idx).type
268
- compute_min_k = False
269
- if (
270
- pa.types.is_integer(field_type)
271
- or pa.types.is_floating(field_type)
272
- or pa.types.is_decimal(field_type)
273
- ):
274
- compute_min_k = True
275
- elif (
276
- pa.types.is_timestamp(field_type)
277
- or pa.types.is_date(field_type)
278
- or pa.types.is_time(field_type)
279
- ):
280
- compute_min_k = True
281
- elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
282
- # compute average length from non-null values; only allow
283
- # min-k/histogram for short strings (avg <= 16)
284
- col_py = None
285
- try:
286
- col_py = col.to_pylist()
287
- except Exception:
288
- col_py = None
289
-
290
- if col_py is not None:
291
- lens = [len(x) for x in col_py if x is not None]
292
- if lens:
293
- avg_len = sum(lens) / len(lens)
294
- if avg_len <= 16:
295
- compute_min_k = True
296
-
297
- # KMV: take K smallest hashes when allowed; otherwise store an
298
- # empty list for this column.
299
- if compute_min_k:
300
- smallest = heapq.nsmallest(K, hashes)
301
- col_min_k = sorted(smallest)
302
- else:
303
- col_min_k = []
304
-
305
- # For histogram decisions follow the same rule as min-k
306
- compute_hist = compute_min_k
307
-
308
- # Use draken.compress() to get canonical int64 per value
309
- mapped = list(vec.compress())
310
- non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
311
- if non_nulls_mapped:
312
- vmin = min(non_nulls_mapped)
313
- vmax = max(non_nulls_mapped)
314
- col_min = int(vmin)
315
- col_max = int(vmax)
316
- if compute_hist:
317
- if vmin == vmax:
318
- col_hist = [0] * HBINS
319
- col_hist[-1] = len(non_nulls_mapped)
320
- else:
321
- col_hist = [0] * HBINS
322
- span = float(vmax - vmin)
323
- for m in non_nulls_mapped:
324
- b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
325
- if b < 0:
326
- b = 0
327
- if b >= HBINS:
328
- b = HBINS - 1
329
- col_hist[b] += 1
330
- else:
331
- col_hist = [0] * HBINS
332
- else:
333
- # no non-null values; histogram via hash buckets
334
- col_min = NULL_FLAG
335
- col_max = NULL_FLAG
336
- if compute_hist:
337
- col_hist = [0] * HBINS
338
- for h in hashes:
339
- b = (h >> (64 - 5)) & 0x1F
340
- col_hist[b] += 1
341
- else:
342
- col_hist = [0] * HBINS
343
-
344
- min_k_hashes.append(col_min_k)
345
- histograms.append(col_hist)
346
- min_values.append(col_min)
347
- max_values.append(col_max)
348
- except Exception:
349
- # If draken or its dependencies are unavailable, fall back to
350
- # conservative defaults so we can still write the manifest and
351
- # snapshot without failing the append operation.
352
- num_cols = table.num_columns
353
- min_k_hashes = [[] for _ in range(num_cols)]
354
- HBINS = 32
355
- histograms = [[0] * HBINS for _ in range(num_cols)]
356
- min_values = [NULL_FLAG] * num_cols
357
- max_values = [NULL_FLAG] * num_cols
358
-
359
- entries = [
360
- {
361
- "file_path": data_path,
362
- "file_format": "parquet",
363
- "record_count": int(table.num_rows),
364
- "file_size_in_bytes": len(pdata),
365
- "min_k_hashes": min_k_hashes,
366
- "histogram_counts": histograms,
367
- "histogram_bins": HBINS,
368
- "min_values": min_values,
369
- "max_values": max_values,
370
- }
371
- ]
249
+ # Write table and build manifest entry
250
+ manifest_entry = self._write_table_and_build_entry(table)
251
+ entries = [manifest_entry.to_dict()]
372
252
 
373
253
  # persist manifest: for append, merge previous manifest entries
374
254
  # with the new entries so the snapshot's manifest is cumulative.
@@ -384,35 +264,15 @@ class SimpleDataset(Dataset):
384
264
  prev_manifest_path = prev_snap.manifest_list
385
265
  try:
386
266
  # Prefer FileIO when available
387
- if self.io and hasattr(self.io, "new_input"):
388
- inp = self.io.new_input(prev_manifest_path)
389
- with inp.open() as f:
390
- prev_data = f.read()
391
- import pyarrow as pa
392
- import pyarrow.parquet as pq
393
-
394
- prev_table = pq.read_table(pa.BufferReader(prev_data))
395
- prev_rows = prev_table.to_pylist()
396
- merged_entries = prev_rows + merged_entries
397
- else:
398
- # Fall back to catalog storage client (GCS)
399
- if (
400
- self.catalog
401
- and getattr(self.catalog, "_storage_client", None)
402
- and getattr(self.catalog, "gcs_bucket", None)
403
- ):
404
- bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
405
- parsed = prev_manifest_path
406
- if parsed.startswith("gs://"):
407
- parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
408
- blob = bucket.blob(parsed)
409
- prev_data = blob.download_as_bytes()
410
- import pyarrow as pa
411
- import pyarrow.parquet as pq
412
-
413
- prev_table = pq.read_table(pa.BufferReader(prev_data))
414
- prev_rows = prev_table.to_pylist()
415
- merged_entries = prev_rows + merged_entries
267
+ inp = self.io.new_input(prev_manifest_path)
268
+ with inp.open() as f:
269
+ prev_data = f.read()
270
+ import pyarrow as pa
271
+ import pyarrow.parquet as pq
272
+
273
+ prev_table = pq.read_table(pa.BufferReader(prev_data))
274
+ prev_rows = prev_table.to_pylist()
275
+ merged_entries = prev_rows + merged_entries
416
276
  except Exception:
417
277
  # If we can't read the previous manifest, continue with
418
278
  # just the new entries (don't fail the append).
@@ -433,63 +293,52 @@ class SimpleDataset(Dataset):
433
293
  commit_message = f"commit by {author}"
434
294
 
435
295
  recs = int(table.num_rows)
436
- fsize = len(pdata)
296
+ fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
297
+ # Calculate uncompressed size from the manifest entry
298
+ added_data_size = manifest_entry.uncompressed_size_in_bytes
437
299
  added_data_files = 1
438
300
  added_files_size = fsize
439
301
  added_records = recs
440
302
  deleted_data_files = 0
441
303
  deleted_files_size = 0
304
+ deleted_data_size = 0
442
305
  deleted_records = 0
443
306
 
444
307
  prev = self.snapshot()
445
308
  if prev and prev.summary:
446
- try:
447
- prev_total_files = int(prev.summary.get("total-data-files", 0))
448
- except Exception:
449
- prev_total_files = 0
450
- try:
451
- prev_total_size = int(prev.summary.get("total-files-size", 0))
452
- except Exception:
453
- prev_total_size = 0
454
- try:
455
- prev_total_records = int(prev.summary.get("total-records", 0))
456
- except Exception:
457
- prev_total_records = 0
309
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
310
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
311
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
312
+ prev_total_records = int(prev.summary.get("total-records", 0))
458
313
  else:
459
314
  prev_total_files = 0
460
315
  prev_total_size = 0
316
+ prev_total_data_size = 0
461
317
  prev_total_records = 0
462
318
 
463
319
  total_data_files = prev_total_files + added_data_files - deleted_data_files
464
320
  total_files_size = prev_total_size + added_files_size - deleted_files_size
321
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
465
322
  total_records = prev_total_records + added_records - deleted_records
466
323
 
467
324
  summary = {
468
325
  "added-data-files": added_data_files,
469
326
  "added-files-size": added_files_size,
327
+ "added-data-size": added_data_size,
470
328
  "added-records": added_records,
471
329
  "deleted-data-files": deleted_data_files,
472
330
  "deleted-files-size": deleted_files_size,
331
+ "deleted-data-size": deleted_data_size,
473
332
  "deleted-records": deleted_records,
474
333
  "total-data-files": total_data_files,
475
334
  "total-files-size": total_files_size,
335
+ "total-data-size": total_data_size,
476
336
  "total-records": total_records,
477
337
  }
478
338
 
479
339
  # sequence number
480
340
  try:
481
- max_seq = 0
482
- for s in self.metadata.snapshots:
483
- seq = getattr(s, "sequence_number", None)
484
- if seq is None:
485
- continue
486
- try:
487
- ival = int(seq)
488
- except Exception:
489
- continue
490
- if ival > max_seq:
491
- max_seq = ival
492
- next_seq = max_seq + 1
341
+ next_seq = self._next_sequence_number()
493
342
  except Exception:
494
343
  next_seq = 1
495
344
 
@@ -518,6 +367,140 @@ class SimpleDataset(Dataset):
518
367
  if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
519
368
  self.catalog.save_dataset_metadata(self.identifier, self.metadata)
520
369
 
370
+ def _write_table_and_build_entry(self, table: Any):
371
+ """Write a PyArrow table to storage and return a ParquetManifestEntry.
372
+
373
+ This centralizes the IO and manifest construction so other operations
374
+ (e.g. `overwrite`) can reuse the same behavior as `append`.
375
+ """
376
+ # Write parquet file with collision-resistant name
377
+ fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
378
+ data_path = f"{self.metadata.location}/data/{fname}"
379
+
380
+ import pyarrow as pa
381
+ import pyarrow.parquet as pq
382
+
383
+ from ..iops.fileio import WRITE_PARQUET_OPTIONS
384
+
385
+ buf = pa.BufferOutputStream()
386
+ pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
387
+ pdata = buf.getvalue().to_pybytes()
388
+
389
+ out = self.io.new_output(data_path).create()
390
+ out.write(pdata)
391
+ out.close()
392
+
393
+ # Build manifest entry with statistics using a bytes-based, per-column scan
394
+ manifest_entry = build_parquet_manifest_entry_from_bytes(
395
+ pdata, data_path, len(pdata), orig_table=table
396
+ )
397
+ return manifest_entry
398
+
399
+ def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
400
+ """Replace the dataset entirely with `table` in a single snapshot.
401
+
402
+ Semantics:
403
+ - Write the provided table as new data file(s)
404
+ - Create a new parquet manifest that contains only the new entries
405
+ - Create a snapshot that records previous files as deleted and the
406
+ new files as added (logical replace)
407
+ """
408
+ # Similar validation as append
409
+ snapshot_id = int(time.time() * 1000)
410
+
411
+ if not hasattr(table, "schema"):
412
+ raise TypeError("overwrite() expects a pyarrow.Table-like object")
413
+
414
+ if author is None:
415
+ raise ValueError("author must be provided when overwriting a dataset")
416
+
417
+ # Write new data and build manifest entries (single table -> single entry)
418
+ manifest_entry = self._write_table_and_build_entry(table)
419
+ new_entries = [manifest_entry.to_dict()]
420
+
421
+ # Write manifest containing only the new entries
422
+ manifest_path = None
423
+ if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
424
+ manifest_path = self.catalog.write_parquet_manifest(
425
+ snapshot_id, new_entries, self.metadata.location
426
+ )
427
+
428
+ # Compute deltas: previous manifest becomes deleted
429
+ prev = self.snapshot(None)
430
+ prev_total_files = 0
431
+ prev_total_size = 0
432
+ prev_total_data_size = 0
433
+ prev_total_records = 0
434
+ if prev and prev.summary:
435
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
436
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
437
+ prev_total_data_size = int(prev.summary.get("total-data-size", 0))
438
+ prev_total_records = int(prev.summary.get("total-records", 0))
439
+
440
+ deleted_data_files = prev_total_files
441
+ deleted_files_size = prev_total_size
442
+ deleted_data_size = prev_total_data_size
443
+ deleted_records = prev_total_records
444
+
445
+ added_data_files = len(new_entries)
446
+ added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
447
+ added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
448
+ added_records = sum(e.get("record_count", 0) for e in new_entries)
449
+
450
+ total_data_files = added_data_files
451
+ total_files_size = added_files_size
452
+ total_data_size = added_data_size
453
+ total_records = added_records
454
+
455
+ summary = {
456
+ "added-data-files": added_data_files,
457
+ "added-files-size": added_files_size,
458
+ "added-data-size": added_data_size,
459
+ "added-records": added_records,
460
+ "deleted-data-files": deleted_data_files,
461
+ "deleted-files-size": deleted_files_size,
462
+ "deleted-data-size": deleted_data_size,
463
+ "deleted-records": deleted_records,
464
+ "total-data-files": total_data_files,
465
+ "total-files-size": total_files_size,
466
+ "total-data-size": total_data_size,
467
+ "total-records": total_records,
468
+ }
469
+
470
+ # sequence number
471
+ try:
472
+ next_seq = self._next_sequence_number()
473
+ except Exception:
474
+ next_seq = 1
475
+
476
+ parent_id = self.metadata.current_snapshot_id
477
+
478
+ if commit_message is None:
479
+ commit_message = f"overwrite by {author}"
480
+
481
+ snap = Snapshot(
482
+ snapshot_id=snapshot_id,
483
+ timestamp_ms=snapshot_id,
484
+ author=author,
485
+ sequence_number=next_seq,
486
+ user_created=True,
487
+ operation_type="overwrite",
488
+ parent_snapshot_id=parent_id,
489
+ manifest_list=manifest_path,
490
+ schema_id=self.metadata.current_schema_id,
491
+ commit_message=commit_message,
492
+ summary=summary,
493
+ )
494
+
495
+ # Replace in-memory snapshots
496
+ self.metadata.snapshots.append(snap)
497
+ self.metadata.current_snapshot_id = snapshot_id
498
+
499
+ if self.catalog and hasattr(self.catalog, "save_snapshot"):
500
+ self.catalog.save_snapshot(self.identifier, snap)
501
+ if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
502
+ self.catalog.save_dataset_metadata(self.identifier, self.metadata)
503
+
521
504
  def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
522
505
  """Add filenames to the dataset manifest without writing the files.
523
506
 
@@ -540,45 +523,20 @@ class SimpleDataset(Dataset):
540
523
  prev_total_records = 0
541
524
  prev_entries = []
542
525
  if prev and prev.summary:
543
- try:
544
- prev_total_files = int(prev.summary.get("total-data-files", 0))
545
- except Exception:
546
- prev_total_files = 0
547
- try:
548
- prev_total_size = int(prev.summary.get("total-files-size", 0))
549
- except Exception:
550
- prev_total_size = 0
551
- try:
552
- prev_total_records = int(prev.summary.get("total-records", 0))
553
- except Exception:
554
- prev_total_records = 0
555
-
526
+ prev_total_files = int(prev.summary.get("total-data-files", 0))
527
+ prev_total_size = int(prev.summary.get("total-files-size", 0))
528
+ prev_total_records = int(prev.summary.get("total-records", 0))
556
529
  if prev and getattr(prev, "manifest_list", None):
557
530
  # try to read prev manifest entries
558
531
  try:
559
532
  import pyarrow as pa
560
533
  import pyarrow.parquet as pq
561
534
 
562
- if self.io and hasattr(self.io, "new_input"):
563
- inp = self.io.new_input(prev.manifest_list)
564
- with inp.open() as f:
565
- data = f.read()
566
- table = pq.read_table(pa.BufferReader(data))
567
- prev_entries = table.to_pylist()
568
- else:
569
- if (
570
- self.catalog
571
- and getattr(self.catalog, "_storage_client", None)
572
- and getattr(self.catalog, "gcs_bucket", None)
573
- ):
574
- bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
575
- parsed = prev.manifest_list
576
- if parsed.startswith("gs://"):
577
- parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
578
- blob = bucket.blob(parsed)
579
- data = blob.download_as_bytes()
580
- table = pq.read_table(pa.BufferReader(data))
581
- prev_entries = table.to_pylist()
535
+ inp = self.io.new_input(prev.manifest_list)
536
+ with inp.open() as f:
537
+ data = f.read()
538
+ table = pq.read_table(pa.BufferReader(data))
539
+ prev_entries = table.to_pylist()
582
540
  except Exception:
583
541
  prev_entries = []
584
542
 
@@ -587,9 +545,7 @@ class SimpleDataset(Dataset):
587
545
  }
588
546
 
589
547
  # Build new entries for files that don't already exist. Only accept
590
- # Parquet files and attempt to read lightweight metadata (bytes,
591
- # row count, per-column min/max) from the Parquet footer when
592
- # available.
548
+ # Parquet files and compute full statistics for each file.
593
549
  new_entries = []
594
550
  seen = set()
595
551
  for fp in files:
@@ -600,147 +556,52 @@ class SimpleDataset(Dataset):
600
556
  continue
601
557
  seen.add(fp)
602
558
 
603
- # Attempt to read file bytes and parquet metadata
604
- file_size = 0
605
- record_count = 0
606
- min_values = []
607
- max_values = []
559
+ # Read file and compute full statistics
608
560
  try:
609
561
  import pyarrow as pa
610
562
  import pyarrow.parquet as pq
611
563
 
612
- data = None
613
- if self.io and hasattr(self.io, "new_input"):
614
- inp = self.io.new_input(fp)
615
- with inp.open() as f:
616
- data = f.read()
617
- else:
618
- if (
619
- self.catalog
620
- and getattr(self.catalog, "_storage_client", None)
621
- and getattr(self.catalog, "gcs_bucket", None)
622
- ):
623
- bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
624
- parsed = fp
625
- if parsed.startswith("gs://"):
626
- parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
627
- blob = bucket.blob(parsed)
628
- data = blob.download_as_bytes()
564
+ inp = self.io.new_input(fp)
565
+ with inp.open() as f:
566
+ data = f.read()
629
567
 
630
568
  if data:
569
+ # Compute statistics using a single read of the compressed bytes
631
570
  file_size = len(data)
632
- pf = pq.ParquetFile(pa.BufferReader(data))
633
- record_count = int(pf.metadata.num_rows or 0)
634
-
635
- # Prefer computing min/max via draken.compress() over
636
- # relying on Parquet footer stats which may contain
637
- # heterogenous or non-numeric values. Fall back to
638
- # footer stats only if draken is unavailable.
639
- try:
640
- import opteryx.draken as draken # type: ignore
641
-
642
- table = pq.read_table(pa.BufferReader(data))
643
- ncols = table.num_columns
644
- mins = [None] * ncols
645
- maxs = [None] * ncols
646
-
647
- NULL_FLAG = -(1 << 63)
648
-
649
- for ci in range(ncols):
650
- try:
651
- col = table.column(ci)
652
- # combine chunks if needed
653
- if hasattr(col, "combine_chunks"):
654
- arr = col.combine_chunks()
655
- else:
656
- arr = col
657
- vec = draken.Vector.from_arrow(arr)
658
- mapped = list(vec.compress())
659
- non_nulls = [m for m in mapped if m != NULL_FLAG]
660
- if non_nulls:
661
- mins[ci] = int(min(non_nulls))
662
- maxs[ci] = int(max(non_nulls))
663
- else:
664
- mins[ci] = None
665
- maxs[ci] = None
666
- except Exception:
667
- # per-column fallback: leave None
668
- mins[ci] = None
669
- maxs[ci] = None
670
- except Exception:
671
- # Draken not available; fall back to Parquet footer stats
672
- ncols = pf.metadata.num_columns
673
- mins = [None] * ncols
674
- maxs = [None] * ncols
675
- for rg in range(pf.num_row_groups):
676
- for ci in range(ncols):
677
- col_meta = pf.metadata.row_group(rg).column(ci)
678
- stats = getattr(col_meta, "statistics", None)
679
- if not stats:
680
- continue
681
- smin = getattr(stats, "min", None)
682
- smax = getattr(stats, "max", None)
683
- if smin is None and smax is None:
684
- continue
685
-
686
- def _to_py(v):
687
- try:
688
- return int(v)
689
- except Exception:
690
- try:
691
- return float(v)
692
- except Exception:
693
- try:
694
- if isinstance(v, (bytes, bytearray)):
695
- return v.decode("utf-8", errors="ignore")
696
- except Exception:
697
- pass
698
- return v
699
-
700
- if smin is not None:
701
- sval = _to_py(smin)
702
- if mins[ci] is None:
703
- mins[ci] = sval
704
- else:
705
- try:
706
- if sval < mins[ci]:
707
- mins[ci] = sval
708
- except Exception:
709
- pass
710
- if smax is not None:
711
- sval = _to_py(smax)
712
- if maxs[ci] is None:
713
- maxs[ci] = sval
714
- else:
715
- try:
716
- if sval > maxs[ci]:
717
- maxs[ci] = sval
718
- except Exception:
719
- pass
720
-
721
- # normalize lists to empty lists when values missing
722
- min_values = [m for m in mins if m is not None]
723
- max_values = [m for m in maxs if m is not None]
571
+ manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
572
+ else:
573
+ # Empty file, create placeholder entry
574
+ manifest_entry = ParquetManifestEntry(
575
+ file_path=fp,
576
+ file_format="parquet",
577
+ record_count=0,
578
+ null_counts=[],
579
+ file_size_in_bytes=0,
580
+ uncompressed_size_in_bytes=0,
581
+ column_uncompressed_sizes_in_bytes=[],
582
+ min_k_hashes=[],
583
+ histogram_counts=[],
584
+ histogram_bins=0,
585
+ min_values=[],
586
+ max_values=[],
587
+ )
724
588
  except Exception:
725
- # If metadata read fails, fall back to placeholders
726
- file_size = 0
727
- record_count = 0
728
- min_values = []
729
- max_values = []
730
-
731
- new_entries.append(
732
- {
733
- "file_path": fp,
734
- "file_format": "parquet",
735
- "record_count": int(record_count),
736
- "file_size_in_bytes": int(file_size),
737
- "min_k_hashes": [],
738
- "histogram_counts": [],
739
- "histogram_bins": 0,
740
- "min_values": min_values,
741
- "max_values": max_values,
742
- }
743
- )
589
+ # If read fails, fall back to placeholders
590
+ manifest_entry = ParquetManifestEntry(
591
+ file_path=fp,
592
+ file_format="parquet",
593
+ record_count=0,
594
+ null_counts=[],
595
+ file_size_in_bytes=0,
596
+ uncompressed_size_in_bytes=0,
597
+ column_uncompressed_sizes_in_bytes=[],
598
+ min_k_hashes=[],
599
+ histogram_counts=[],
600
+ histogram_bins=0,
601
+ min_values=[],
602
+ max_values=[],
603
+ )
604
+ new_entries.append(manifest_entry.to_dict())
744
605
 
745
606
  merged_entries = prev_entries + new_entries
746
607
 
@@ -754,41 +615,44 @@ class SimpleDataset(Dataset):
754
615
  # Build summary deltas
755
616
  added_data_files = len(new_entries)
756
617
  added_files_size = 0
618
+ added_data_size = 0
757
619
  added_records = 0
620
+ # Sum statistics from new entries
621
+ for entry in new_entries:
622
+ added_data_size += entry.get("uncompressed_size_in_bytes", 0)
623
+ added_records += entry.get("record_count", 0)
758
624
  deleted_data_files = 0
759
625
  deleted_files_size = 0
626
+ deleted_data_size = 0
760
627
  deleted_records = 0
761
628
 
629
+ prev_total_data_size = (
630
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
631
+ )
632
+
762
633
  total_data_files = prev_total_files + added_data_files - deleted_data_files
763
634
  total_files_size = prev_total_size + added_files_size - deleted_files_size
635
+ total_data_size = prev_total_data_size + added_data_size - deleted_data_size
764
636
  total_records = prev_total_records + added_records - deleted_records
765
637
 
766
638
  summary = {
767
639
  "added-data-files": added_data_files,
768
640
  "added-files-size": added_files_size,
641
+ "added-data-size": added_data_size,
769
642
  "added-records": added_records,
770
643
  "deleted-data-files": deleted_data_files,
771
644
  "deleted-files-size": deleted_files_size,
645
+ "deleted-data-size": deleted_data_size,
772
646
  "deleted-records": deleted_records,
773
647
  "total-data-files": total_data_files,
774
648
  "total-files-size": total_files_size,
649
+ "total-data-size": total_data_size,
775
650
  "total-records": total_records,
776
651
  }
777
652
 
778
653
  # Sequence number
779
654
  try:
780
- max_seq = 0
781
- for s in self.metadata.snapshots:
782
- seq = getattr(s, "sequence_number", None)
783
- if seq is None:
784
- continue
785
- try:
786
- ival = int(seq)
787
- except Exception:
788
- continue
789
- if ival > max_seq:
790
- max_seq = ival
791
- next_seq = max_seq + 1
655
+ next_seq = self._next_sequence_number()
792
656
  except Exception:
793
657
  next_seq = 1
794
658
 
@@ -853,7 +717,7 @@ class SimpleDataset(Dataset):
853
717
  prev_total_records = 0
854
718
 
855
719
  # Build unique new entries (ignore duplicates in input). Only accept
856
- # parquet files and try to read lightweight metadata from each file.
720
+ # parquet files and compute full statistics for each file.
857
721
  new_entries = []
858
722
  seen = set()
859
723
  for fp in files:
@@ -863,14 +727,7 @@ class SimpleDataset(Dataset):
863
727
  continue
864
728
  seen.add(fp)
865
729
 
866
- file_size = 0
867
- record_count = 0
868
- min_values = []
869
- max_values = []
870
730
  try:
871
- import pyarrow as pa
872
- import pyarrow.parquet as pq
873
-
874
731
  data = None
875
732
  if self.io and hasattr(self.io, "new_input"):
876
733
  inp = self.io.new_input(fp)
@@ -890,80 +747,42 @@ class SimpleDataset(Dataset):
890
747
  data = blob.download_as_bytes()
891
748
 
892
749
  if data:
750
+ # Compute statistics using a single read of the compressed bytes
893
751
  file_size = len(data)
894
- pf = pq.ParquetFile(pa.BufferReader(data))
895
- record_count = int(pf.metadata.num_rows or 0)
896
-
897
- ncols = pf.metadata.num_columns
898
- mins = [None] * ncols
899
- maxs = [None] * ncols
900
- for rg in range(pf.num_row_groups):
901
- for ci in range(ncols):
902
- col_meta = pf.metadata.row_group(rg).column(ci)
903
- stats = getattr(col_meta, "statistics", None)
904
- if not stats:
905
- continue
906
- smin = getattr(stats, "min", None)
907
- smax = getattr(stats, "max", None)
908
- if smin is None and smax is None:
909
- continue
910
-
911
- def _to_py(v):
912
- try:
913
- return int(v)
914
- except Exception:
915
- try:
916
- return float(v)
917
- except Exception:
918
- try:
919
- if isinstance(v, (bytes, bytearray)):
920
- return v.decode("utf-8", errors="ignore")
921
- except Exception:
922
- pass
923
- return v
924
-
925
- if smin is not None:
926
- sval = _to_py(smin)
927
- if mins[ci] is None:
928
- mins[ci] = sval
929
- else:
930
- try:
931
- if sval < mins[ci]:
932
- mins[ci] = sval
933
- except Exception:
934
- pass
935
- if smax is not None:
936
- sval = _to_py(smax)
937
- if maxs[ci] is None:
938
- maxs[ci] = sval
939
- else:
940
- try:
941
- if sval > maxs[ci]:
942
- maxs[ci] = sval
943
- except Exception:
944
- pass
945
-
946
- min_values = [m for m in mins if m is not None]
947
- max_values = [m for m in maxs if m is not None]
752
+ manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
753
+ else:
754
+ # Empty file, create placeholder entry
755
+ manifest_entry = ParquetManifestEntry(
756
+ file_path=fp,
757
+ file_format="parquet",
758
+ record_count=0,
759
+ null_counts=[],
760
+ file_size_in_bytes=0,
761
+ uncompressed_size_in_bytes=0,
762
+ column_uncompressed_sizes_in_bytes=[],
763
+ min_k_hashes=[],
764
+ histogram_counts=[],
765
+ histogram_bins=0,
766
+ min_values=[],
767
+ max_values=[],
768
+ )
948
769
  except Exception:
949
- file_size = 0
950
- record_count = 0
951
- min_values = []
952
- max_values = []
953
-
954
- new_entries.append(
955
- {
956
- "file_path": fp,
957
- "file_format": "parquet",
958
- "record_count": int(record_count),
959
- "file_size_in_bytes": int(file_size),
960
- "min_k_hashes": [],
961
- "histogram_counts": [],
962
- "histogram_bins": 0,
963
- "min_values": min_values,
964
- "max_values": max_values,
965
- }
966
- )
770
+ # If read fails, create placeholder entry
771
+ manifest_entry = ParquetManifestEntry(
772
+ file_path=fp,
773
+ file_format="parquet",
774
+ record_count=0,
775
+ null_counts=[],
776
+ file_size_in_bytes=0,
777
+ uncompressed_size_in_bytes=0,
778
+ column_uncompressed_sizes_in_bytes=[],
779
+ min_k_hashes=[],
780
+ histogram_counts=[],
781
+ histogram_bins=0,
782
+ min_values=[],
783
+ max_values=[],
784
+ )
785
+ new_entries.append(manifest_entry.to_dict())
967
786
 
968
787
  manifest_path = None
969
788
  if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
@@ -974,42 +793,43 @@ class SimpleDataset(Dataset):
974
793
  # Build summary: previous entries become deleted
975
794
  deleted_data_files = prev_total_files
976
795
  deleted_files_size = prev_total_size
796
+ deleted_data_size = (
797
+ int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
798
+ )
977
799
  deleted_records = prev_total_records
978
800
 
979
801
  added_data_files = len(new_entries)
980
802
  added_files_size = 0
803
+ added_data_size = 0
981
804
  added_records = 0
805
+ # Sum statistics from new entries
806
+ for entry in new_entries:
807
+ added_data_size += entry.get("uncompressed_size_in_bytes", 0)
808
+ added_records += entry.get("record_count", 0)
982
809
 
983
810
  total_data_files = added_data_files
984
811
  total_files_size = added_files_size
812
+ total_data_size = added_data_size
985
813
  total_records = added_records
986
814
 
987
815
  summary = {
988
816
  "added-data-files": added_data_files,
989
817
  "added-files-size": added_files_size,
818
+ "added-data-size": added_data_size,
990
819
  "added-records": added_records,
991
820
  "deleted-data-files": deleted_data_files,
992
821
  "deleted-files-size": deleted_files_size,
822
+ "deleted-data-size": deleted_data_size,
993
823
  "deleted-records": deleted_records,
994
824
  "total-data-files": total_data_files,
995
825
  "total-files-size": total_files_size,
826
+ "total-data-size": total_data_size,
996
827
  "total-records": total_records,
997
828
  }
998
829
 
999
830
  # Sequence number
1000
831
  try:
1001
- max_seq = 0
1002
- for s in self.metadata.snapshots:
1003
- seq = getattr(s, "sequence_number", None)
1004
- if seq is None:
1005
- continue
1006
- try:
1007
- ival = int(seq)
1008
- except Exception:
1009
- continue
1010
- if ival > max_seq:
1011
- max_seq = ival
1012
- next_seq = max_seq + 1
832
+ next_seq = self._next_sequence_number()
1013
833
  except Exception:
1014
834
  next_seq = 1
1015
835
 
@@ -1041,14 +861,10 @@ class SimpleDataset(Dataset):
1041
861
  if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
1042
862
  self.catalog.save_dataset_metadata(self.identifier, self.metadata)
1043
863
 
1044
- def scan(
1045
- self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
1046
- ) -> Iterable[Datafile]:
864
+ def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
1047
865
  """Return Datafile objects for the given snapshot.
1048
866
 
1049
867
  - If `snapshot_id` is None, use the current snapshot.
1050
- - Ignore `row_filter` for now and return all files listed in the
1051
- snapshot's parquet manifest (if present).
1052
868
  """
1053
869
  # Determine snapshot to read using the dataset-level helper which
1054
870
  # prefers the in-memory current snapshot and otherwise performs a
@@ -1065,8 +881,6 @@ class SimpleDataset(Dataset):
1065
881
  import pyarrow as pa
1066
882
  import pyarrow.parquet as pq
1067
883
 
1068
- data = None
1069
-
1070
884
  inp = self.io.new_input(manifest_path)
1071
885
  with inp.open() as f:
1072
886
  data = f.read()
@@ -1076,23 +890,536 @@ class SimpleDataset(Dataset):
1076
890
 
1077
891
  table = pq.read_table(pa.BufferReader(data))
1078
892
  rows = table.to_pylist()
1079
- cum_rows = 0
1080
893
  for r in rows:
1081
894
  yield Datafile(entry=r)
1082
- try:
1083
- rc = int(r.get("record_count") or 0)
1084
- except Exception:
1085
- rc = 0
1086
- cum_rows += rc
1087
- if row_limit is not None and cum_rows >= row_limit:
1088
- break
1089
895
  except FileNotFoundError:
1090
896
  return iter(())
1091
897
  except Exception:
1092
898
  return iter(())
1093
899
 
900
def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
    """Describe every schema column for the given snapshot.

    The snapshot's parquet manifest is read once and the per-file
    statistics it holds are folded into one result per schema column.

    Args:
        snapshot_id: Snapshot to describe; passed straight to
            ``self.snapshot`` (``None`` selects whatever that helper
            treats as the default).
        bins: Number of buckets in the returned ``distribution``. When it
            is less than 1 no distribution is produced.

    Returns:
        A dict mapping column name -> statistics dict with the keys
        ``dataset``, ``description``, ``row_count``, ``column``, ``min``,
        ``max``, ``null_count``, ``uncompressed_bytes``, ``cardinality``,
        ``cardinality_is_exact`` and ``distribution`` (``None`` when no
        numeric histogram is available). Text columns additionally get
        ``min_display`` / ``max_display`` when the manifest carries them.

    Raises:
        ValueError: If the snapshot has no manifest, the manifest is
            empty, or the schema cannot be resolved.
    """
    import heapq

    snap = self.snapshot(snapshot_id)
    if snap is None or not getattr(snap, "manifest_list", None):
        raise ValueError("No manifest available for this dataset/snapshot")

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Read the manifest once; I/O and parse errors propagate to the caller.
    inp = self.io.new_input(snap.manifest_list)
    with inp.open() as f:
        data = f.read()
    if not data:
        raise ValueError("Empty manifest data")
    entries = pq.read_table(pa.BufferReader(data)).to_pylist()

    try:
        orso_schema = self.schema()
    except Exception:
        orso_schema = None
    if orso_schema is None:
        raise ValueError("Schema unavailable; cannot describe all columns")

    columns = list(orso_schema.columns)
    # Per-file statistic lists are indexed by schema column position.
    col_to_idx: dict[str, int] = {c.name: i for i, c in enumerate(columns)}

    def _item_at(seq, idx):
        """Return seq[idx], or None when the position is absent."""
        try:
            return seq[idx]
        except (IndexError, KeyError, TypeError):
            return None

    def _int_at(seq, idx):
        """Return int(seq[idx]), or 0 when absent or not convertible."""
        try:
            return int(seq[idx])
        except (IndexError, KeyError, TypeError, ValueError, OverflowError):
            return 0

    def _strip_marker(raw):
        """Convert to bytes and drop a trailing 0xFF truncation marker."""
        b = bytes(raw)
        return b[:-1] if b and b[-1] == 0xFF else b

    def _parse_number(text):
        """Return text as int, else float, else the text unchanged."""
        for cast in (int, float):
            try:
                return cast(text)
            except (TypeError, ValueError):
                continue
        return text

    def _decode_minmax(v):
        """Normalise a stored min/max into a number or string (or None)."""
        if v is None:
            return None
        if isinstance(v, (int, float)):
            return v
        if isinstance(v, str):
            # Numeric-looking strings are parsed for backward compatibility.
            return _parse_number(v)
        if isinstance(v, (bytes, bytearray, memoryview)):
            try:
                return _parse_number(_strip_marker(v).decode("utf-8"))
            except UnicodeDecodeError:
                return None
        return None

    def _decode_display(v):
        """Return a display value as text, or None if it is not textual."""
        if isinstance(v, (bytes, bytearray, memoryview)):
            return _strip_marker(v).decode("utf-8", errors="replace")
        if isinstance(v, str):
            return v
        return None

    def _distogram_distribution(infos, gmin, gmax):
        """Merge per-file histograms with distogram and re-bucket into `bins`."""
        from opteryx_catalog.maki_nage.distogram import Distogram
        from opteryx_catalog.maki_nage.distogram import count as _count_dist
        from opteryx_catalog.maki_nage.distogram import count_up_to as _count_up_to
        from opteryx_catalog.maki_nage.distogram import merge as _merge_distogram
        from opteryx_catalog.maki_nage.distogram import update as _update_distogram

        dist_bin_count = max(50, bins * 5)
        merged = Distogram(bin_count=dist_bin_count)
        for fmin, fmax, counts in infos:
            fbins = len(counts)
            if fbins <= 0:
                continue
            per_file = Distogram(bin_count=dist_bin_count)
            span = fmax - fmin
            for bi, cnt in enumerate(counts):
                if not cnt or cnt <= 0:
                    continue
                # Each file bucket is represented by its midpoint.
                rep = fmin + (bi + 0.5) * span / fbins
                _update_distogram(per_file, float(rep), int(cnt))
            merged = _merge_distogram(merged, per_file)

        out = [0] * bins
        total = int(_count_dist(merged) or 0)
        if total == 0:
            return out
        prev_cum = 0.0
        for i in range(1, bins + 1):
            edge = gmin + (i / bins) * (gmax - gmin)
            cum = _count_up_to(merged, edge) or 0.0
            out[i - 1] = int(round(cum - prev_cum))
            prev_cum = cum
        # Put any rounding remainder in the last bucket so counts add up.
        out[-1] += total - sum(out)
        return out

    def _linear_distribution(infos, gmin, gmax):
        """Fallback: drop each file bucket's midpoint into a global bucket."""
        out = [0] * bins
        gspan = gmax - gmin
        for fmin, fmax, counts in infos:
            fbins = len(counts)
            for bi, cnt in enumerate(counts):
                if not cnt or cnt <= 0:
                    continue
                rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
                gi = int((rep - gmin) / gspan * bins)
                gi = min(max(gi, 0), bins - 1)
                out[gi] += int(cnt)
        return out

    # Accumulators per column
    stats: dict[str, dict] = {
        name: {
            "null_count": 0,
            "mins": [],
            "maxs": [],
            "hashes": set(),
            "file_hist_infos": [],
            "min_displays": [],
            "max_displays": [],
            "uncompressed_bytes": 0,
        }
        for name in col_to_idx
    }
    total_rows = 0

    # Single pass through manifest entries updating per-column accumulators
    for ent in entries:
        if not isinstance(ent, dict):
            continue
        total_rows += int(ent.get("record_count") or 0)

        ncounts = ent.get("null_counts") or []
        mks = ent.get("min_k_hashes") or []
        hists = ent.get("histogram_counts") or []
        mv = ent.get("min_values") or []
        xv = ent.get("max_values") or []
        mv_disp = ent.get("min_values_display") or []
        xv_disp = ent.get("max_values_display") or []
        col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []

        for cname, cidx in col_to_idx.items():
            acc = stats[cname]
            acc["null_count"] += _int_at(ncounts, cidx)

            dmin = _decode_minmax(_item_at(mv, cidx))
            dmax = _decode_minmax(_item_at(xv, cidx))
            if dmin is not None:
                acc["mins"].append(dmin)
            if dmax is not None:
                acc["maxs"].append(dmax)

            md = _decode_display(_item_at(mv_disp, cidx))
            xd = _decode_display(_item_at(xv_disp, cidx))
            if md is not None:
                acc["min_displays"].append(md)
            if xd is not None:
                acc["max_displays"].append(xd)

            for h in _item_at(mks, cidx) or []:
                try:
                    acc["hashes"].add(int(h))
                except (TypeError, ValueError):
                    pass

            # A histogram is only usable with a non-degenerate numeric range.
            col_hist = _item_at(hists, cidx)
            if col_hist and dmin is not None and dmax is not None and dmin != dmax:
                try:
                    acc["file_hist_infos"].append(
                        (float(dmin), float(dmax), list(col_hist))
                    )
                except (TypeError, ValueError, OverflowError):
                    pass

            acc["uncompressed_bytes"] += _int_at(col_sizes, cidx)

    # Build results per column
    results: dict[str, dict] = {}
    for cname, cidx in col_to_idx.items():
        acc = stats[cname]

        # Strings and numbers cannot be compared with each other, so they
        # are reduced separately; strings take precedence when present.
        str_mins = [v for v in acc["mins"] if isinstance(v, str)]
        num_mins = [v for v in acc["mins"] if not isinstance(v, str)]
        str_maxs = [v for v in acc["maxs"] if isinstance(v, str)]
        num_maxs = [v for v in acc["maxs"] if not isinstance(v, str)]

        global_min = min(str_mins) if str_mins else (min(num_mins) if num_mins else None)
        global_max = max(str_maxs) if str_maxs else (max(num_maxs) if num_maxs else None)

        # Cardinality from the smallest collected hashes.
        # NOTE(review): the threshold is 31 while 32 hashes are requested;
        # looks like a possible off-by-one - confirm against the writer's
        # per-file hash count before changing.
        cardinality = 0
        cardinality_is_exact = False
        hashes = acc["hashes"]
        if hashes:
            try:
                smallest = heapq.nsmallest(32, hashes)
                k = len(smallest)
                if k < 31:
                    cardinality = k
                    cardinality_is_exact = True
                else:
                    kth = smallest[-1]  # nsmallest returns ascending order
                    if kth == 0:
                        cardinality = k
                    else:
                        cardinality = int((k - 1) * (1 << 64) / (kth + 1))
            except ArithmeticError:
                cardinality = 0

        # Distribution: only for a strictly increasing numeric range. The
        # isinstance checks keep string/number comparisons from raising.
        distribution = None
        numeric_range = (
            isinstance(global_min, (int, float))
            and isinstance(global_max, (int, float))
            and global_max > global_min
        )
        if bins >= 1 and acc["file_hist_infos"] and numeric_range:
            gmin = float(global_min)
            gmax = float(global_max)
            try:
                distribution = _distogram_distribution(acc["file_hist_infos"], gmin, gmax)
            except Exception:
                # distogram unavailable or failed: use the simple re-bucketing,
                # and give up on the distribution rather than fail describe().
                try:
                    distribution = _linear_distribution(acc["file_hist_infos"], gmin, gmax)
                except (ArithmeticError, TypeError, ValueError):
                    distribution = None

        res = {
            "dataset": self.identifier,
            "description": getattr(self.metadata, "description", None),
            "row_count": total_rows,
            "column": cname,
            "min": global_min,
            "max": global_max,
            "null_count": acc["null_count"],
            "uncompressed_bytes": acc["uncompressed_bytes"],
            "cardinality": cardinality,
            "cardinality_is_exact": cardinality_is_exact,
            "distribution": distribution,
        }

        # Text columns: expose the first available display value per bound,
        # truncated to 16 characters.
        try:
            ctype = getattr(columns[cidx], "type", None)
            sctype = str(ctype).lower() if ctype is not None else ""
            is_text = "char" in sctype or "string" in sctype
        except Exception:
            is_text = False

        if is_text:
            min_disp = next((d[:16] for d in acc["min_displays"] if d), None)
            max_disp = next((d[:16] for d in acc["max_displays"] if d), None)
            if min_disp is not None or max_disp is not None:
                res["min_display"] = min_disp
                res["max_display"] = max_disp

        results[cname] = res

    return results
1286
+
1287
def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
    """Refresh manifest statistics and create a new snapshot.

    Every data file listed in the current manifest is re-read and its
    manifest entry rebuilt; a new manifest is written and a new snapshot
    is appended with ``user_created=False`` and
    ``operation_type='statistics-refresh'``. The set of files is not
    changed, so all added/deleted summary counters are zero.

    Args:
        agent: Identifier of the agent performing the refresh; recorded
            under ``summary["agent-committer"]``.
        author: Author to record; when omitted the current snapshot's
            author is reused.

    Returns:
        The new ``snapshot_id``, or ``None`` when the refresh cannot be
        performed (no catalog able to write manifests, or the current
        manifest cannot be read).

    Raises:
        ValueError: If there is no current snapshot with a manifest.
    """
    prev = self.snapshot(None)
    if prev is None or not getattr(prev, "manifest_list", None):
        raise ValueError("No current manifest available to refresh")

    # Check up front, before re-reading every data file, that the new
    # manifest can actually be written.
    catalog = self.catalog
    if not catalog or not hasattr(catalog, "write_parquet_manifest"):
        return None

    use_author = author if author is not None else getattr(prev, "author", None)

    snapshot_id = int(time.time() * 1000)

    # Load the entries of the current manifest (a parquet file).
    try:
        inp = self.io.new_input(prev.manifest_list)
        with inp.open() as f:
            prev_data = f.read()
        import pyarrow as pa
        import pyarrow.parquet as pq

        prev_rows = pq.read_table(pa.BufferReader(prev_data)).to_pylist()
    except Exception:
        # Continuing with no entries would commit a current snapshot that
        # lists zero files; report failure instead.
        return None

    entries = []
    total_files = 0
    total_size = 0
    total_data_size = 0
    total_records = 0

    for ent in prev_rows:
        if not isinstance(ent, dict):
            continue
        fp = ent.get("file_path")
        if not fp:
            continue
        try:
            inp = self.io.new_input(fp)
            with inp.open() as f:
                data = f.read()
            # Full statistics including histograms and k-hashes
            manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, len(data))
            dent = manifest_entry.to_dict()
        except Exception:
            # Best effort: keep the existing entry if the file cannot be
            # re-read, so it stays part of the dataset.
            dent = ent

        entries.append(dent)
        total_files += 1
        total_size += int(dent.get("file_size_in_bytes") or 0)
        total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
        total_records += int(dent.get("record_count") or 0)

    manifest_path = catalog.write_parquet_manifest(
        snapshot_id, entries, self.metadata.location
    )

    summary = {
        "added-data-files": 0,
        "added-files-size": 0,
        "added-data-size": 0,
        "added-records": 0,
        "deleted-data-files": 0,
        "deleted-files-size": 0,
        "deleted-data-size": 0,
        "deleted-records": 0,
        "total-data-files": total_files,
        "total-files-size": total_size,
        "total-data-size": total_data_size,
        "total-records": total_records,
    }

    try:
        next_seq = self._next_sequence_number()
    except Exception:
        next_seq = 1

    parent_id = self.metadata.current_snapshot_id

    # Agent committer metadata
    agent_meta = {
        "timestamp": int(time.time() * 1000),
        "action": "statistics-refresh",
        "agent": agent,
    }

    # The commit timestamp and message of the refreshed snapshot are
    # carried over, since the data itself has not changed.
    snap = Snapshot(
        snapshot_id=snapshot_id,
        timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
        author=use_author,
        sequence_number=next_seq,
        user_created=False,
        operation_type="statistics-refresh",
        parent_snapshot_id=parent_id,
        manifest_list=manifest_path,
        schema_id=self.metadata.current_schema_id,
        commit_message=getattr(prev, "commit_message", "statistics refresh"),
        summary=summary,
    )

    # attach agent metadata under summary
    if snap.summary is None:
        snap.summary = {}
    snap.summary["agent-committer"] = agent_meta

    # update in-memory metadata
    self.metadata.snapshots.append(snap)
    self.metadata.current_snapshot_id = snapshot_id

    # persist
    if hasattr(catalog, "save_snapshot"):
        catalog.save_snapshot(self.identifier, snap)
    if hasattr(catalog, "save_dataset_metadata"):
        catalog.save_dataset_metadata(self.identifier, self.metadata)

    return snapshot_id
1420
+
1094
1421
  def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
1095
- """Delete all data files and manifests for this table.
1422
+ """Delete all data files and manifests for this dataset.
1096
1423
 
1097
1424
  This attempts to delete every data file referenced by existing
1098
1425
  Parquet manifests and then delete the manifest files themselves.
@@ -1109,6 +1436,7 @@ class SimpleDataset(Dataset):
1109
1436
  snaps = list(self.metadata.snapshots)
1110
1437
  removed_files = []
1111
1438
  removed_total_size = 0
1439
+ removed_data_size = 0
1112
1440
 
1113
1441
  for snap in snaps:
1114
1442
  manifest_path = getattr(snap, "manifest_list", None)
@@ -1118,31 +1446,34 @@ class SimpleDataset(Dataset):
1118
1446
  # Read manifest via FileIO if available
1119
1447
  rows = []
1120
1448
  try:
1121
- if hasattr(io, "new_input"):
1122
- inp = io.new_input(manifest_path)
1123
- with inp.open() as f:
1124
- data = f.read()
1125
- table = pq.read_table(pa.BufferReader(data))
1126
- rows = table.to_pylist()
1449
+ inp = io.new_input(manifest_path)
1450
+ with inp.open() as f:
1451
+ data = f.read()
1452
+ table = pq.read_table(pa.BufferReader(data))
1453
+ rows = table.to_pylist()
1127
1454
  except Exception:
1128
1455
  rows = []
1129
1456
 
1130
1457
  for r in rows:
1131
1458
  fp = None
1132
1459
  fsize = 0
1460
+ data_size = 0
1133
1461
  if isinstance(r, dict):
1134
1462
  fp = r.get("file_path")
1135
1463
  fsize = int(r.get("file_size_in_bytes") or 0)
1464
+ data_size = int(r.get("uncompressed_size_in_bytes") or 0)
1136
1465
  if not fp and "data_file" in r and isinstance(r["data_file"], dict):
1137
1466
  fp = r["data_file"].get("file_path") or r["data_file"].get("path")
1138
1467
  fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
1468
+ data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
1139
1469
 
1140
1470
  if fp:
1141
1471
  removed_files.append(fp)
1142
1472
  removed_total_size += fsize
1473
+ removed_data_size += data_size
1143
1474
 
1144
1475
  # Create a new empty Parquet manifest (entries=[]) to represent the
1145
- # truncated table for the new snapshot. Do not delete objects.
1476
+ # truncated dataset for the new snapshot. Do not delete objects.
1146
1477
  snapshot_id = int(time.time() * 1000)
1147
1478
 
1148
1479
  # Do NOT write an empty Parquet manifest when there are no entries.
@@ -1157,29 +1488,21 @@ class SimpleDataset(Dataset):
1157
1488
  summary = {
1158
1489
  "added-data-files": 0,
1159
1490
  "added-files-size": 0,
1491
+ "added-data-size": 0,
1160
1492
  "added-records": 0,
1161
1493
  "deleted-data-files": deleted_count,
1162
1494
  "deleted-files-size": deleted_size,
1495
+ "deleted-data-size": removed_data_size,
1163
1496
  "deleted-records": 0,
1164
1497
  "total-data-files": 0,
1165
1498
  "total-files-size": 0,
1499
+ "total-data-size": 0,
1166
1500
  "total-records": 0,
1167
1501
  }
1168
1502
 
1169
1503
  # Sequence number
1170
1504
  try:
1171
- max_seq = 0
1172
- for s in self.metadata.snapshots:
1173
- seq = getattr(s, "sequence_number", None)
1174
- if seq is None:
1175
- continue
1176
- try:
1177
- ival = int(seq)
1178
- except Exception:
1179
- continue
1180
- if ival > max_seq:
1181
- max_seq = ival
1182
- next_seq = max_seq + 1
1505
+ next_seq = self._next_sequence_number()
1183
1506
  except Exception:
1184
1507
  next_seq = 1
1185
1508
 
@@ -1215,7 +1538,4 @@ class SimpleDataset(Dataset):
1215
1538
  self.metadata.current_snapshot_id = snapshot_id
1216
1539
 
1217
1540
  if self.catalog and hasattr(self.catalog, "save_snapshot"):
1218
- try:
1219
- self.catalog.save_snapshot(self.identifier, snap)
1220
- except Exception:
1221
- pass
1541
+ self.catalog.save_snapshot(self.identifier, snap)