opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -9,8 +9,7 @@ from typing import Iterable
 from typing import Optional
 
 from .manifest import ParquetManifestEntry
-from .manifest import build_parquet_manifest_entry
-from .manifest import build_parquet_manifest_minmax_entry
+from .manifest import build_parquet_manifest_entry_from_bytes
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -381,16 +380,20 @@ class SimpleDataset(Dataset):
         import pyarrow as pa
         import pyarrow.parquet as pq
 
+        from ..iops.fileio import WRITE_PARQUET_OPTIONS
+
         buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, compression="zstd")
+        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
         pdata = buf.getvalue().to_pybytes()
 
         out = self.io.new_output(data_path).create()
         out.write(pdata)
         out.close()
 
-        # Build manifest entry with statistics
-        manifest_entry = build_parquet_manifest_entry(table, data_path, len(pdata))
+        # Build manifest entry with statistics using a bytes-based, per-column scan
+        manifest_entry = build_parquet_manifest_entry_from_bytes(
+            pdata, data_path, len(pdata), orig_table=table
+        )
         return manifest_entry
 
     def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
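The write path above now serialises the table once into an in-memory buffer, uploads those bytes, and hands the same bytes to the statistics builder, so the file is never re-read. A minimal sketch of that buffer-once pattern; the contents of WRITE_PARQUET_OPTIONS are an assumption here (the real values live in opteryx_catalog/iops/fileio.py, which this diff only shows as changed):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Assumed example values; the real options are defined in opteryx_catalog.iops.fileio.
    WRITE_PARQUET_OPTIONS = {"compression": "zstd"}

    def serialise_table(table: pa.Table) -> bytes:
        # Write once to an in-memory buffer; the same bytes can then be uploaded
        # and passed to build_parquet_manifest_entry_from_bytes for statistics.
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
        return buf.getvalue().to_pybytes()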
@@ -542,9 +545,7 @@ class SimpleDataset(Dataset):
         }
 
         # Build new entries for files that don't already exist. Only accept
-        # Parquet files and attempt to read lightweight metadata (bytes,
-        # row count, per-column min/max) from the Parquet footer when
-        # available.
+        # Parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -555,15 +556,19 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            # Attempt to read file bytes and parquet metadata
-            # Use rugo's metadata reader which is much faster (microseconds per file)
+            # Read file and compute full statistics
             try:
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
                 inp = self.io.new_input(fp)
                 with inp.open() as f:
                     data = f.read()
 
                 if data:
-                    manifest_entry = build_parquet_manifest_minmax_entry(data, fp)
+                    # Compute statistics using a single read of the compressed bytes
+                    file_size = len(data)
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
                 else:
                     # Empty file, create placeholder entry
                     manifest_entry = ParquetManifestEntry(
@@ -581,7 +586,7 @@ class SimpleDataset(Dataset):
                         max_values=[],
                     )
             except Exception:
-                # If metadata read fails, fall back to placeholders
+                # If read fails, fall back to placeholders
                 manifest_entry = ParquetManifestEntry(
                     file_path=fp,
                     file_format="parquet",
@@ -612,9 +617,10 @@ class SimpleDataset(Dataset):
         added_files_size = 0
         added_data_size = 0
         added_records = 0
-        # Sum uncompressed sizes from new entries
+        # Sum statistics from new entries
        for entry in new_entries:
             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
         deleted_data_files = 0
         deleted_files_size = 0
         deleted_data_size = 0
@@ -711,7 +717,7 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
 
         # Build unique new entries (ignore duplicates in input). Only accept
-        # parquet files and try to read lightweight metadata from each file.
+        # parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -721,14 +727,7 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
             try:
-                import pyarrow as pa
-                import pyarrow.parquet as pq
-
                 data = None
                 if self.io and hasattr(self.io, "new_input"):
                     inp = self.io.new_input(fp)
@@ -748,89 +747,41 @@ class SimpleDataset(Dataset):
                     data = blob.download_as_bytes()
 
                 if data:
+                    # Compute statistics using a single read of the compressed bytes
                     file_size = len(data)
-                    pf = pq.ParquetFile(pa.BufferReader(data))
-                    record_count = int(pf.metadata.num_rows or 0)
-
-                    ncols = pf.metadata.num_columns
-                    mins = [None] * ncols
-                    maxs = [None] * ncols
-                    null_counts = [0] * ncols
-                    for rg in range(pf.num_row_groups):
-                        for ci in range(ncols):
-                            col_meta = pf.metadata.row_group(rg).column(ci)
-                            stats = getattr(col_meta, "statistics", None)
-                            if not stats:
-                                continue
-                            smin = getattr(stats, "min", None)
-                            smax = getattr(stats, "max", None)
-                            snull_count = getattr(stats, "null_count", None)
-                            if smin is None and smax is None and snull_count is None:
-                                continue
-
-                            def _to_py(v):
-                                try:
-                                    return int(v)
-                                except Exception:
-                                    try:
-                                        return float(v)
-                                    except Exception:
-                                        try:
-                                            if isinstance(v, (bytes, bytearray)):
-                                                return v.decode("utf-8", errors="ignore")
-                                        except Exception:
-                                            pass
-                                        return v
-
-                            if smin is not None:
-                                sval = _to_py(smin)
-                                if mins[ci] is None:
-                                    mins[ci] = sval
-                                else:
-                                    try:
-                                        if sval < mins[ci]:
-                                            mins[ci] = sval
-                                    except Exception:
-                                        pass
-                            if smax is not None:
-                                sval = _to_py(smax)
-                                if maxs[ci] is None:
-                                    maxs[ci] = sval
-                                else:
-                                    try:
-                                        if sval > maxs[ci]:
-                                            maxs[ci] = sval
-                                    except Exception:
-                                        pass
-                            if snull_count is not None:
-                                try:
-                                    null_counts[ci] += int(snull_count)
-                                except Exception:
-                                    pass
-
-                    min_values = [m for m in mins if m is not None]
-                    max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
-                file_size = 0
-                record_count = 0
-                min_values = []
-                max_values = []
-                null_counts = []
-
-                manifest_entry = ParquetManifestEntry(
-                    file_path=fp,
-                    file_format="parquet",
-                    record_count=int(record_count),
-                    null_counts=null_counts,
-                    file_size_in_bytes=int(file_size),
-                    uncompressed_size_in_bytes=int(file_size),  # Use compressed size as estimate
-                    column_uncompressed_sizes_in_bytes=[],
-                    min_k_hashes=[],
-                    histogram_counts=[],
-                    histogram_bins=0,
-                    min_values=min_values,
-                    max_values=max_values,
-                )
+                # If read fails, create placeholder entry
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
             new_entries.append(manifest_entry.to_dict())
 
         manifest_path = None
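The long block removed above walked the Parquet footer by hand (per row group, per column) to recover row counts, min/max values, and null counts; add_files now delegates all of that to build_parquet_manifest_entry_from_bytes. For reference, a minimal standalone sketch of the footer-statistics pattern the removed code relied on, using only public pyarrow APIs (illustrative, not part of opteryx-catalog):

    import pyarrow as pa
    import pyarrow.parquet as pq

    def footer_stats(data: bytes):
        # Read per-column min/max/null_count from the footer without decoding data pages.
        pf = pq.ParquetFile(pa.BufferReader(data))
        ncols = pf.metadata.num_columns
        mins, maxs, nulls = [None] * ncols, [None] * ncols, [0] * ncols
        for rg in range(pf.metadata.num_row_groups):
            for ci in range(ncols):
                stats = pf.metadata.row_group(rg).column(ci).statistics
                if stats is None or not stats.has_min_max:
                    continue
                mins[ci] = stats.min if mins[ci] is None else min(mins[ci], stats.min)
                maxs[ci] = stats.max if maxs[ci] is None else max(maxs[ci], stats.max)
                nulls[ci] += stats.null_count or 0
        return pf.metadata.num_rows, mins, maxs, nulls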
@@ -850,10 +801,11 @@ class SimpleDataset(Dataset):
         added_data_files = len(new_entries)
         added_files_size = 0
         added_data_size = 0
-        # Sum uncompressed sizes from new entries
+        added_records = 0
+        # Sum statistics from new entries
         for entry in new_entries:
             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
-        added_records = 0
+            added_records += entry.get("record_count", 0)
 
         total_data_files = added_data_files
         total_files_size = added_files_size
@@ -909,9 +861,7 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
-    def scan(
-        self, row_filter=None, snapshot_id: Optional[int] = None
-    ) -> Iterable[Datafile]:
+    def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
 
         - If `snapshot_id` is None, use the current snapshot.
@@ -947,6 +897,393 @@ class SimpleDataset(Dataset):
         except Exception:
             return iter(())
 
+    def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
+        """Describe all schema columns for the given snapshot.
+
+        Returns a dict mapping column name -> statistics (same shape as
+        the previous `describe` per-column output).
+        """
+        import heapq
+
+        snap = self.snapshot(snapshot_id)
+        if snap is None or not getattr(snap, "manifest_list", None):
+            raise ValueError("No manifest available for this dataset/snapshot")
+
+        manifest_path = snap.manifest_list
+
+        # Read manifest once
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            inp = self.io.new_input(manifest_path)
+            with inp.open() as f:
+                data = f.read()
+
+            if not data:
+                raise ValueError("Empty manifest data")
+
+            table = pq.read_table(pa.BufferReader(data))
+            entries = table.to_pylist()
+        except Exception:
+            raise
+
+        # Resolve schema and describe all columns
+        orso_schema = None
+        try:
+            orso_schema = self.schema()
+        except Exception:
+            orso_schema = None
+
+        if orso_schema is None:
+            raise ValueError("Schema unavailable; cannot describe all columns")
+
+        # Map column name -> index for every schema column
+        col_to_idx: dict[str, int] = {c.name: i for i, c in enumerate(orso_schema.columns)}
+
+        # Initialize accumulators per column
+        stats: dict[str, dict] = {}
+        for name in col_to_idx:
+            stats[name] = {
+                "null_count": 0,
+                "mins": [],
+                "maxs": [],
+                "hashes": set(),
+                "file_hist_infos": [],
+                "min_displays": [],
+                "max_displays": [],
+                "uncompressed_bytes": 0,
+            }
+
+        total_rows = 0
+
+        def _decode_minmax(v):
+            if v is None:
+                return None
+            if isinstance(v, (int, float)):
+                return v
+            # For strings stored as string values (not bytes), return as-is
+            if isinstance(v, str):
+                # Try to parse as number for backward compatibility
+                try:
+                    return int(v)
+                except Exception:
+                    try:
+                        return float(v)
+                    except Exception:
+                        # Not a number, return the string itself for display
+                        return v
+            try:
+                if isinstance(v, (bytes, bytearray, memoryview)):
+                    b = bytes(v)
+                    if b and b[-1] == 0xFF:
+                        b = b[:-1]
+                    s = b.decode("utf-8")
+                    try:
+                        return int(s)
+                    except Exception:
+                        try:
+                            return float(s)
+                        except Exception:
+                            # Decoded bytes that aren't numbers, return as string
+                            return s
+            except Exception:
+                pass
+            return None
+
+        # Single pass through entries updating per-column accumulators
+        for ent in entries:
+            if not isinstance(ent, dict):
+                continue
+            total_rows += int(ent.get("record_count") or 0)
+
+            # prefetch lists
+            ncounts = ent.get("null_counts") or []
+            mks = ent.get("min_k_hashes") or []
+            hists = ent.get("histogram_counts") or []
+            mv = ent.get("min_values") or []
+            xv = ent.get("max_values") or []
+            mv_disp = ent.get("min_values_display") or []
+            xv_disp = ent.get("max_values_display") or []
+            col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []
+
+            for cname, cidx in col_to_idx.items():
+                # nulls
+                try:
+                    stats[cname]["null_count"] += int((ncounts or [0])[cidx])
+                except Exception:
+                    pass
+
+                # mins/maxs
+                try:
+                    raw_min = mv[cidx]
+                except Exception:
+                    raw_min = None
+                try:
+                    raw_max = xv[cidx]
+                except Exception:
+                    raw_max = None
+                dmin = _decode_minmax(raw_min)
+                dmax = _decode_minmax(raw_max)
+                if dmin is not None:
+                    stats[cname]["mins"].append(dmin)
+                if dmax is not None:
+                    stats[cname]["maxs"].append(dmax)
+
+                # collect textual display values when present
+                try:
+                    try:
+                        raw_min_disp = mv_disp[cidx]
+                    except Exception:
+                        raw_min_disp = None
+                    try:
+                        raw_max_disp = xv_disp[cidx]
+                    except Exception:
+                        raw_max_disp = None
+
+                    def _decode_display(v):
+                        if v is None:
+                            return None
+                        try:
+                            if isinstance(v, (bytes, bytearray, memoryview)):
+                                b = bytes(v)
+                                if b and b[-1] == 0xFF:
+                                    b = b[:-1]
+                                return b.decode("utf-8", errors="replace")
+                            if isinstance(v, str):
+                                return v
+                        except Exception:
+                            return None
+                        return None
+
+                    md = _decode_display(raw_min_disp)
+                    xd = _decode_display(raw_max_disp)
+                    if md is not None:
+                        stats[cname]["min_displays"].append(md)
+                    if xd is not None:
+                        stats[cname]["max_displays"].append(xd)
+                except Exception:
+                    pass
+
+                # min-k hashes
+                try:
+                    col_mk = mks[cidx] or []
+                except Exception:
+                    col_mk = []
+                for h in col_mk:
+                    try:
+                        stats[cname]["hashes"].add(int(h))
+                    except Exception:
+                        pass
+
+                # histograms
+                try:
+                    col_hist = hists[cidx]
+                except Exception:
+                    col_hist = []
+                if col_hist:
+                    try:
+                        if dmin is not None and dmax is not None and dmin != dmax:
+                            stats[cname]["file_hist_infos"].append(
+                                (float(dmin), float(dmax), list(col_hist))
+                            )
+                    except Exception:
+                        pass
+
+                # uncompressed bytes for this column (sum across files)
+                try:
+                    stats[cname]["uncompressed_bytes"] += int((col_sizes or [0])[cidx])
+                except Exception:
+                    pass
+
+        # Build results per column
+        results: dict[str, dict] = {}
+        for cname, cidx in col_to_idx.items():
+            s = stats[cname]
+            # Handle mixed types: separate strings from numbers
+            mins_filtered = [v for v in s["mins"] if v is not None]
+            maxs_filtered = [v for v in s["maxs"] if v is not None]
+
+            # Group by type: strings vs numbers
+            str_mins = [v for v in mins_filtered if isinstance(v, str)]
+            num_mins = [v for v in mins_filtered if not isinstance(v, str)]
+            str_maxs = [v for v in maxs_filtered if isinstance(v, str)]
+            num_maxs = [v for v in maxs_filtered if not isinstance(v, str)]
+
+            # Use whichever type has values (strings take precedence for text columns)
+            global_min = None
+            global_max = None
+            if str_mins:
+                global_min = min(str_mins)
+            elif num_mins:
+                global_min = min(num_mins)
+
+            if str_maxs:
+                global_max = max(str_maxs)
+            elif num_maxs:
+                global_max = max(num_maxs)
+
+            # kmv approx
+            cardinality = 0
+            cardinality_is_exact = False
+            try:
+                collected = s["hashes"]
+                if collected:
+                    smallest = heapq.nsmallest(32, collected)
+                    k = len(smallest)
+                    if k < 31:
+                        cardinality = len(set(smallest))
+                        cardinality_is_exact = True
+                    else:
+                        MAX_HASH = (1 << 64) - 1
+                        R = max(smallest)
+                        if R == 0:
+                            cardinality = len(set(smallest))
+                        else:
+                            cardinality = int((k - 1) * (MAX_HASH + 1) / (R + 1))
+            except Exception:
+                cardinality = 0
+
+            # distribution via distogram
+            distribution = None
+            if (
+                s["file_hist_infos"]
+                and global_min is not None
+                and global_max is not None
+                and global_max > global_min
+            ):
+                try:
+                    from opteryx_catalog.maki_nage.distogram import Distogram
+                    from opteryx_catalog.maki_nage.distogram import count as _count_dist
+                    from opteryx_catalog.maki_nage.distogram import count_up_to as _count_up_to
+                    from opteryx_catalog.maki_nage.distogram import merge as _merge_distogram
+                    from opteryx_catalog.maki_nage.distogram import update as _update_distogram
+
+                    dist_bin_count = max(50, bins * 5)
+                    global_d = Distogram(bin_count=dist_bin_count)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        temp = Distogram(bin_count=dist_bin_count)
+                        span = float(fmax - fmin) if fmax != fmin else 0.0
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            if span == 0.0:
+                                rep = float(fmin)
+                            else:
+                                rep = fmin + (bi + 0.5) * span / fbins
+                            _update_distogram(temp, float(rep), int(cnt))
+                        global_d = _merge_distogram(global_d, temp)
+
+                    distribution = [0] * bins
+                    total = int(_count_dist(global_d) or 0)
+                    if total == 0:
+                        distribution = [0] * bins
+                    else:
+                        prev = 0.0
+                        gmin = float(global_min)
+                        gmax = float(global_max)
+                        for i in range(1, bins + 1):
+                            edge = gmin + (i / bins) * (gmax - gmin)
+                            cum = _count_up_to(global_d, edge) or 0.0
+                            distribution[i - 1] = int(round(cum - prev))
+                            prev = cum
+                        diff = total - sum(distribution)
+                        if diff != 0:
+                            distribution[-1] += diff
+                except Exception:
+                    distribution = [0] * bins
+                    gspan = float(global_max - global_min)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
+                            gi = int((rep - global_min) / gspan * bins)
+                            if gi < 0:
+                                gi = 0
+                            if gi >= bins:
+                                gi = bins - 1
+                            distribution[gi] += int(cnt)
+
+            res = {
+                "dataset": self.identifier,
+                "description": getattr(self.metadata, "description", None),
+                "row_count": total_rows,
+                "column": cname,
+                "min": global_min,
+                "max": global_max,
+                "null_count": s["null_count"],
+                "uncompressed_bytes": s["uncompressed_bytes"],
+                "cardinality": cardinality,
+                "cardinality_is_exact": cardinality_is_exact,
+                "distribution": distribution,
+            }
+
+            # If textual, attempt display prefixes like describe()
+            try:
+                is_text = False
+                if orso_schema is not None:
+                    col = orso_schema.columns[cidx]
+                    ctype = getattr(col, "type", None)
+                    if ctype is not None:
+                        sctype = str(ctype).lower()
+                        if "char" in sctype or "string" in sctype or "varchar" in sctype:
+                            is_text = True
+            except Exception:
+                is_text = False
+
+            if is_text:
+                # Use only textual display values collected from manifests.
+                # Decode bytes and strip truncation marker (0xFF) if present.
+                def _decode_display_raw(v):
+                    if v is None:
+                        return None
+                    try:
+                        if isinstance(v, (bytes, bytearray, memoryview)):
+                            b = bytes(v)
+                            if b and b[-1] == 0xFF:
+                                b = b[:-1]
+                            s_val = b.decode("utf-8", errors="replace")
+                            return s_val[:16]
+                        if isinstance(v, str):
+                            return v[:16]
+                    except Exception:
+                        return None
+                    return None
+
+                min_disp = None
+                max_disp = None
+                try:
+                    if s.get("min_displays"):
+                        for v in s.get("min_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                min_disp = dv
+                                break
+                    if s.get("max_displays"):
+                        for v in s.get("max_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                max_disp = dv
+                                break
+                except Exception:
+                    min_disp = None
+                    max_disp = None
+
+                if min_disp is not None or max_disp is not None:
+                    res["min_display"] = min_disp
+                    res["max_display"] = max_disp
+
+            results[cname] = res
+
+        return results
+
     def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
         """Refresh manifest statistics and create a new snapshot.
 
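The cardinality figure in describe() above comes from a k-minimum-values (KMV) sketch over the per-file min_k_hashes: keep the smallest 64-bit hashes seen for a column and, once roughly k of them have been retained, estimate the distinct count as (k - 1) * 2^64 / (R + 1), where R is the largest retained hash. A compact standalone sketch of the same estimator (k = 32 and the 64-bit hash space are taken from the code above; the hash values come from the manifest entries):

    import heapq

    MAX_HASH = (1 << 64) - 1

    def kmv_estimate(hashes: set) -> tuple:
        # Returns (estimated_distinct_count, is_exact).
        smallest = heapq.nsmallest(32, hashes)
        k = len(smallest)
        if k < 31:
            # Fewer than ~k distinct hashes ever existed, so the count is exact.
            return len(set(smallest)), True
        R = max(smallest)  # the k-th smallest hash value
        if R == 0:
            return len(set(smallest)), False
        return int((k - 1) * (MAX_HASH + 1) / (R + 1)), False

The intuition: if the hashes are roughly uniform over [0, 2^64), the k-th smallest hash R satisfies k / N ≈ (R + 1) / 2^64 for N distinct values, so N ≈ (k - 1) * 2^64 / (R + 1), with the k - 1 keeping the estimate approximately unbiased.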
@@ -979,8 +1316,9 @@ class SimpleDataset(Dataset):
             import pyarrow as pa
             import pyarrow.parquet as pq
 
-            prev_table = pq.read_table(pa.BufferReader(prev_data))
-            prev_rows = prev_table.to_pylist()
+            # the manifest is a parquet file, read into a pyarrow Table
+            prev_manifest = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_manifest.to_pylist()
         except Exception:
             prev_rows = []
 
@@ -1000,8 +1338,8 @@ class SimpleDataset(Dataset):
                 with inp.open() as f:
                     data = f.read()
                 # Full statistics including histograms and k-hashes
-                table = pq.read_table(pa.BufferReader(data))
-                manifest_entry = build_parquet_manifest_entry(table, fp, len(data))
+                file_size = len(data)
+                manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
                 dent = manifest_entry.to_dict()
             except Exception:
                 # Fall back to original entry if re-read fails
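As the hunk above notes, a manifest is itself a Parquet file whose rows are manifest entries, so refresh_manifest can load the previous snapshot's entries as plain dicts before recomputing statistics file by file. A small sketch of that read path (manifest_bytes stands in for the bytes fetched through self.io; illustrative only):

    import pyarrow as pa
    import pyarrow.parquet as pq

    def load_manifest_entries(manifest_bytes: bytes) -> list:
        # Each row of the manifest Parquet file is one entry, e.g. file_path,
        # record_count, min_values, max_values, min_k_hashes, histogram_counts.
        table = pq.read_table(pa.BufferReader(manifest_bytes))
        return table.to_pylist()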