opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- opteryx_catalog/catalog/compaction.py +15 -8
- opteryx_catalog/catalog/dataset.py +449 -111
- opteryx_catalog/catalog/manifest.py +390 -330
- opteryx_catalog/catalog/metadata.py +3 -0
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +82 -54
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- tests/test_collections.py +37 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
The hunks below are from opteryx_catalog/catalog/dataset.py.

```diff
@@ -9,8 +9,7 @@ from typing import Iterable
 from typing import Optional
 
 from .manifest import ParquetManifestEntry
-from .manifest import
-from .manifest import build_parquet_manifest_minmax_entry
+from .manifest import build_parquet_manifest_entry_from_bytes
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -381,16 +380,20 @@ class SimpleDataset(Dataset):
         import pyarrow as pa
         import pyarrow.parquet as pq
 
+        from ..iops.fileio import WRITE_PARQUET_OPTIONS
+
         buf = pa.BufferOutputStream()
-        pq.write_table(table, buf,
+        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
         pdata = buf.getvalue().to_pybytes()
 
         out = self.io.new_output(data_path).create()
         out.write(pdata)
         out.close()
 
-        # Build manifest entry with statistics
-        manifest_entry =
+        # Build manifest entry with statistics using a bytes-based, per-column scan
+        manifest_entry = build_parquet_manifest_entry_from_bytes(
+            pdata, data_path, len(pdata), orig_table=table
+        )
         return manifest_entry
 
     def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
```
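The write-path change above serializes the table once and builds manifest statistics from the same in-memory buffer, so the freshly written file never has to be read back. A minimal sketch of that flow; the option values shown are assumptions (the real `WRITE_PARQUET_OPTIONS` live in `opteryx_catalog/iops/fileio.py` and may differ):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative stand-in for opteryx_catalog.iops.fileio.WRITE_PARQUET_OPTIONS;
# the actual option values are an assumption here.
WRITE_PARQUET_OPTIONS = {"compression": "zstd"}

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Serialize once to an in-memory buffer with the shared writer options.
buf = pa.BufferOutputStream()
pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
pdata = buf.getvalue().to_pybytes()

# The same bytes can now be uploaded and scanned for statistics,
# avoiding a second read of the object after the write.
roundtrip = pq.read_table(pa.BufferReader(pdata))
assert roundtrip.num_rows == table.num_rows
```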
```diff
@@ -542,9 +545,7 @@ class SimpleDataset(Dataset):
         }
 
         # Build new entries for files that don't already exist. Only accept
-        # Parquet files and
-        # row count, per-column min/max) from the Parquet footer when
-        # available.
+        # Parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -555,15 +556,19 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            #
-            # Use rugo's metadata reader which is much faster (microseconds per file)
+            # Read file and compute full statistics
             try:
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
                 inp = self.io.new_input(fp)
                 with inp.open() as f:
                     data = f.read()
 
                 if data:
-
+                    # Compute statistics using a single read of the compressed bytes
+                    file_size = len(data)
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
                 else:
                     # Empty file, create placeholder entry
                     manifest_entry = ParquetManifestEntry(
@@ -581,7 +586,7 @@ class SimpleDataset(Dataset):
                         max_values=[],
                     )
             except Exception:
-                # If
+                # If read fails, fall back to placeholders
                 manifest_entry = ParquetManifestEntry(
                     file_path=fp,
                     file_format="parquet",
@@ -612,9 +617,10 @@ class SimpleDataset(Dataset):
         added_files_size = 0
         added_data_size = 0
         added_records = 0
-        # Sum
+        # Sum statistics from new entries
         for entry in new_entries:
             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
         deleted_data_files = 0
         deleted_files_size = 0
         deleted_data_size = 0
@@ -711,7 +717,7 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
 
         # Build unique new entries (ignore duplicates in input). Only accept
-        # parquet files and
+        # parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -721,14 +727,7 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
 
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
             try:
-                import pyarrow as pa
-                import pyarrow.parquet as pq
-
                 data = None
                 if self.io and hasattr(self.io, "new_input"):
                     inp = self.io.new_input(fp)
@@ -748,89 +747,41 @@ class SimpleDataset(Dataset):
                         data = blob.download_as_bytes()
 
                 if data:
+                    # Compute statistics using a single read of the compressed bytes
                     file_size = len(data)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        continue
-
-                    def _to_py(v):
-                        try:
-                            return int(v)
-                        except Exception:
-                            try:
-                                return float(v)
-                            except Exception:
-                                try:
-                                    if isinstance(v, (bytes, bytearray)):
-                                        return v.decode("utf-8", errors="ignore")
-                                except Exception:
-                                    pass
-                        return v
-
-                    if smin is not None:
-                        sval = _to_py(smin)
-                        if mins[ci] is None:
-                            mins[ci] = sval
-                        else:
-                            try:
-                                if sval < mins[ci]:
-                                    mins[ci] = sval
-                            except Exception:
-                                pass
-                    if smax is not None:
-                        sval = _to_py(smax)
-                        if maxs[ci] is None:
-                            maxs[ci] = sval
-                        else:
-                            try:
-                                if sval > maxs[ci]:
-                                    maxs[ci] = sval
-                            except Exception:
-                                pass
-                    if snull_count is not None:
-                        try:
-                            null_counts[ci] += int(snull_count)
-                        except Exception:
-                            pass
-
-                    min_values = [m for m in mins if m is not None]
-                    max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        histogram_counts=[],
-                        histogram_bins=0,
-                        min_values=min_values,
-                        max_values=max_values,
-                    )
+                # If read fails, create placeholder entry
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
             new_entries.append(manifest_entry.to_dict())
 
         manifest_path = None
```
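The large removal in the last hunk drops a hand-rolled fold of Parquet footer statistics (the `_to_py` coercion plus per-column min/max and null-count accumulation) in favour of a single `build_parquet_manifest_entry_from_bytes` call. The deleted loop's setup was elided by the diff viewer, so as a reference point only, here is a sketch of that style of footer fold using pyarrow's metadata API, not the original code:

```python
import pyarrow.parquet as pq

def footer_minmax(path: str):
    """Fold per-row-group footer statistics into file-level values (sketch)."""
    md = pq.ParquetFile(path).metadata
    ncols = md.num_columns
    mins = [None] * ncols
    maxs = [None] * ncols
    null_counts = [0] * ncols
    for rg in range(md.num_row_groups):
        for ci in range(ncols):
            st = md.row_group(rg).column(ci).statistics
            if st is None:
                continue
            if st.has_min_max:
                if mins[ci] is None or st.min < mins[ci]:
                    mins[ci] = st.min
                if maxs[ci] is None or st.max > maxs[ci]:
                    maxs[ci] = st.max
            if st.null_count is not None:
                null_counts[ci] += st.null_count
    return mins, maxs, null_counts
```

The replacement computes richer statistics in one pass over bytes already in memory (k-hashes, histograms, per-column uncompressed sizes) instead of re-deriving min/max from footers.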
```diff
@@ -850,10 +801,11 @@ class SimpleDataset(Dataset):
         added_data_files = len(new_entries)
         added_files_size = 0
         added_data_size = 0
-
+        added_records = 0
+        # Sum statistics from new entries
         for entry in new_entries:
             added_data_size += entry.get("uncompressed_size_in_bytes", 0)
-
+            added_records += entry.get("record_count", 0)
 
         total_data_files = added_data_files
         total_files_size = added_files_size
@@ -909,9 +861,7 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
 
-    def scan(
-        self, row_filter=None, snapshot_id: Optional[int] = None
-    ) -> Iterable[Datafile]:
+    def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
 
         - If `snapshot_id` is None, use the current snapshot.
```
```diff
@@ -947,6 +897,393 @@ class SimpleDataset(Dataset):
         except Exception:
             return iter(())
 
+    def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
+        """Describe all schema columns for the given snapshot.
+
+        Returns a dict mapping column name -> statistics (same shape as
+        the previous `describe` per-column output).
+        """
+        import heapq
+
+        snap = self.snapshot(snapshot_id)
+        if snap is None or not getattr(snap, "manifest_list", None):
+            raise ValueError("No manifest available for this dataset/snapshot")
+
+        manifest_path = snap.manifest_list
+
+        # Read manifest once
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            inp = self.io.new_input(manifest_path)
+            with inp.open() as f:
+                data = f.read()
+
+            if not data:
+                raise ValueError("Empty manifest data")
+
+            table = pq.read_table(pa.BufferReader(data))
+            entries = table.to_pylist()
+        except Exception:
+            raise
+
+        # Resolve schema and describe all columns
+        orso_schema = None
+        try:
+            orso_schema = self.schema()
+        except Exception:
+            orso_schema = None
+
+        if orso_schema is None:
+            raise ValueError("Schema unavailable; cannot describe all columns")
+
+        # Map column name -> index for every schema column
+        col_to_idx: dict[str, int] = {c.name: i for i, c in enumerate(orso_schema.columns)}
+
+        # Initialize accumulators per column
+        stats: dict[str, dict] = {}
+        for name in col_to_idx:
+            stats[name] = {
+                "null_count": 0,
+                "mins": [],
+                "maxs": [],
+                "hashes": set(),
+                "file_hist_infos": [],
+                "min_displays": [],
+                "max_displays": [],
+                "uncompressed_bytes": 0,
+            }
+
+        total_rows = 0
+
+        def _decode_minmax(v):
+            if v is None:
+                return None
+            if isinstance(v, (int, float)):
+                return v
+            # For strings stored as string values (not bytes), return as-is
+            if isinstance(v, str):
+                # Try to parse as number for backward compatibility
+                try:
+                    return int(v)
+                except Exception:
+                    try:
+                        return float(v)
+                    except Exception:
+                        # Not a number, return the string itself for display
+                        return v
+            try:
+                if isinstance(v, (bytes, bytearray, memoryview)):
+                    b = bytes(v)
+                    if b and b[-1] == 0xFF:
+                        b = b[:-1]
+                    s = b.decode("utf-8")
+                    try:
+                        return int(s)
+                    except Exception:
+                        try:
+                            return float(s)
+                        except Exception:
+                            # Decoded bytes that aren't numbers, return as string
+                            return s
+            except Exception:
+                pass
+            return None
+
+        # Single pass through entries updating per-column accumulators
+        for ent in entries:
+            if not isinstance(ent, dict):
+                continue
+            total_rows += int(ent.get("record_count") or 0)
+
+            # prefetch lists
+            ncounts = ent.get("null_counts") or []
+            mks = ent.get("min_k_hashes") or []
+            hists = ent.get("histogram_counts") or []
+            mv = ent.get("min_values") or []
+            xv = ent.get("max_values") or []
+            mv_disp = ent.get("min_values_display") or []
+            xv_disp = ent.get("max_values_display") or []
+            col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []
+
+            for cname, cidx in col_to_idx.items():
+                # nulls
+                try:
+                    stats[cname]["null_count"] += int((ncounts or [0])[cidx])
+                except Exception:
+                    pass
+
+                # mins/maxs
+                try:
+                    raw_min = mv[cidx]
+                except Exception:
+                    raw_min = None
+                try:
+                    raw_max = xv[cidx]
+                except Exception:
+                    raw_max = None
+                dmin = _decode_minmax(raw_min)
+                dmax = _decode_minmax(raw_max)
+                if dmin is not None:
+                    stats[cname]["mins"].append(dmin)
+                if dmax is not None:
+                    stats[cname]["maxs"].append(dmax)
+
+                # collect textual display values when present
+                try:
+                    try:
+                        raw_min_disp = mv_disp[cidx]
+                    except Exception:
+                        raw_min_disp = None
+                    try:
+                        raw_max_disp = xv_disp[cidx]
+                    except Exception:
+                        raw_max_disp = None
+
+                    def _decode_display(v):
+                        if v is None:
+                            return None
+                        try:
+                            if isinstance(v, (bytes, bytearray, memoryview)):
+                                b = bytes(v)
+                                if b and b[-1] == 0xFF:
+                                    b = b[:-1]
+                                return b.decode("utf-8", errors="replace")
+                            if isinstance(v, str):
+                                return v
+                        except Exception:
+                            return None
+                        return None
+
+                    md = _decode_display(raw_min_disp)
+                    xd = _decode_display(raw_max_disp)
+                    if md is not None:
+                        stats[cname]["min_displays"].append(md)
+                    if xd is not None:
+                        stats[cname]["max_displays"].append(xd)
+                except Exception:
+                    pass
+
+                # min-k hashes
+                try:
+                    col_mk = mks[cidx] or []
+                except Exception:
+                    col_mk = []
+                for h in col_mk:
+                    try:
+                        stats[cname]["hashes"].add(int(h))
+                    except Exception:
+                        pass
+
+                # histograms
+                try:
+                    col_hist = hists[cidx]
+                except Exception:
+                    col_hist = []
+                if col_hist:
+                    try:
+                        if dmin is not None and dmax is not None and dmin != dmax:
+                            stats[cname]["file_hist_infos"].append(
+                                (float(dmin), float(dmax), list(col_hist))
+                            )
+                    except Exception:
+                        pass
+
+                # uncompressed bytes for this column (sum across files)
+                try:
+                    stats[cname]["uncompressed_bytes"] += int((col_sizes or [0])[cidx])
+                except Exception:
+                    pass
+
+        # Build results per column
+        results: dict[str, dict] = {}
+        for cname, cidx in col_to_idx.items():
+            s = stats[cname]
+            # Handle mixed types: separate strings from numbers
+            mins_filtered = [v for v in s["mins"] if v is not None]
+            maxs_filtered = [v for v in s["maxs"] if v is not None]
+
+            # Group by type: strings vs numbers
+            str_mins = [v for v in mins_filtered if isinstance(v, str)]
+            num_mins = [v for v in mins_filtered if not isinstance(v, str)]
+            str_maxs = [v for v in maxs_filtered if isinstance(v, str)]
+            num_maxs = [v for v in maxs_filtered if not isinstance(v, str)]
+
+            # Use whichever type has values (strings take precedence for text columns)
+            global_min = None
+            global_max = None
+            if str_mins:
+                global_min = min(str_mins)
+            elif num_mins:
+                global_min = min(num_mins)
+
+            if str_maxs:
+                global_max = max(str_maxs)
+            elif num_maxs:
+                global_max = max(num_maxs)
+
+            # kmv approx
+            cardinality = 0
+            cardinality_is_exact = False
+            try:
+                collected = s["hashes"]
+                if collected:
+                    smallest = heapq.nsmallest(32, collected)
+                    k = len(smallest)
+                    if k < 31:
+                        cardinality = len(set(smallest))
+                        cardinality_is_exact = True
+                    else:
+                        MAX_HASH = (1 << 64) - 1
+                        R = max(smallest)
+                        if R == 0:
+                            cardinality = len(set(smallest))
+                        else:
+                            cardinality = int((k - 1) * (MAX_HASH + 1) / (R + 1))
+            except Exception:
+                cardinality = 0
+
+            # distribution via distogram
+            distribution = None
+            if (
+                s["file_hist_infos"]
+                and global_min is not None
+                and global_max is not None
+                and global_max > global_min
+            ):
+                try:
+                    from opteryx_catalog.maki_nage.distogram import Distogram
+                    from opteryx_catalog.maki_nage.distogram import count as _count_dist
+                    from opteryx_catalog.maki_nage.distogram import count_up_to as _count_up_to
+                    from opteryx_catalog.maki_nage.distogram import merge as _merge_distogram
+                    from opteryx_catalog.maki_nage.distogram import update as _update_distogram
+
+                    dist_bin_count = max(50, bins * 5)
+                    global_d = Distogram(bin_count=dist_bin_count)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        temp = Distogram(bin_count=dist_bin_count)
+                        span = float(fmax - fmin) if fmax != fmin else 0.0
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            if span == 0.0:
+                                rep = float(fmin)
+                            else:
+                                rep = fmin + (bi + 0.5) * span / fbins
+                            _update_distogram(temp, float(rep), int(cnt))
+                        global_d = _merge_distogram(global_d, temp)
+
+                    distribution = [0] * bins
+                    total = int(_count_dist(global_d) or 0)
+                    if total == 0:
+                        distribution = [0] * bins
+                    else:
+                        prev = 0.0
+                        gmin = float(global_min)
+                        gmax = float(global_max)
+                        for i in range(1, bins + 1):
+                            edge = gmin + (i / bins) * (gmax - gmin)
+                            cum = _count_up_to(global_d, edge) or 0.0
+                            distribution[i - 1] = int(round(cum - prev))
+                            prev = cum
+                        diff = total - sum(distribution)
+                        if diff != 0:
+                            distribution[-1] += diff
+                except Exception:
+                    distribution = [0] * bins
+                    gspan = float(global_max - global_min)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
+                            gi = int((rep - global_min) / gspan * bins)
+                            if gi < 0:
+                                gi = 0
+                            if gi >= bins:
+                                gi = bins - 1
+                            distribution[gi] += int(cnt)
+
+            res = {
+                "dataset": self.identifier,
+                "description": getattr(self.metadata, "description", None),
+                "row_count": total_rows,
+                "column": cname,
+                "min": global_min,
+                "max": global_max,
+                "null_count": s["null_count"],
+                "uncompressed_bytes": s["uncompressed_bytes"],
+                "cardinality": cardinality,
+                "cardinality_is_exact": cardinality_is_exact,
+                "distribution": distribution,
+            }
+
+            # If textual, attempt display prefixes like describe()
+            try:
+                is_text = False
+                if orso_schema is not None:
+                    col = orso_schema.columns[cidx]
+                    ctype = getattr(col, "type", None)
+                    if ctype is not None:
+                        sctype = str(ctype).lower()
+                        if "char" in sctype or "string" in sctype or "varchar" in sctype:
+                            is_text = True
+            except Exception:
+                is_text = False
+
+            if is_text:
+                # Use only textual display values collected from manifests.
+                # Decode bytes and strip truncation marker (0xFF) if present.
+                def _decode_display_raw(v):
+                    if v is None:
+                        return None
+                    try:
+                        if isinstance(v, (bytes, bytearray, memoryview)):
+                            b = bytes(v)
+                            if b and b[-1] == 0xFF:
+                                b = b[:-1]
+                            s_val = b.decode("utf-8", errors="replace")
+                            return s_val[:16]
+                        if isinstance(v, str):
+                            return v[:16]
+                    except Exception:
+                        return None
+                    return None
+
+                min_disp = None
+                max_disp = None
+                try:
+                    if s.get("min_displays"):
+                        for v in s.get("min_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                min_disp = dv
+                                break
+                    if s.get("max_displays"):
+                        for v in s.get("max_displays"):
+                            dv = _decode_display_raw(v)
+                            if dv:
+                                max_disp = dv
+                                break
+                except Exception:
+                    min_disp = None
+                    max_disp = None
+
+                if min_disp is not None or max_disp is not None:
+                    res["min_display"] = min_disp
+                    res["max_display"] = max_disp
+
+            results[cname] = res
+
+        return results
+
     def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
         """Refresh manifest statistics and create a new snapshot.
 
```
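The `describe()` method added above estimates per-column cardinality from the pooled `min_k_hashes` with a K-Minimum-Values (KMV) estimator: for hashes uniform over `[0, 2**64 - 1]`, if the k-th smallest pooled hash is `R`, the distinct count is roughly `(k - 1) * 2**64 / (R + 1)`. A self-contained sketch of that estimate, mirroring the new code's constants:

```python
import heapq
import random

def kmv_cardinality(hashes: set[int], k: int = 32) -> tuple[int, bool]:
    """Return (estimate, is_exact) using the k smallest 64-bit hashes."""
    if not hashes:
        return 0, True
    smallest = heapq.nsmallest(k, hashes)
    n = len(smallest)
    if n < k - 1:
        # Fewer than k distinct hashes were ever collected: count exactly.
        return len(set(smallest)), True
    MAX_HASH = (1 << 64) - 1
    R = max(smallest)  # the k-th smallest hash overall
    if R == 0:
        return len(set(smallest)), False
    # Order statistics of uniform hashes: the k-th smallest sits near k/N * 2**64.
    return int((n - 1) * (MAX_HASH + 1) / (R + 1)), False

# Example with synthetic hashes; the estimate lands near the true 10,000.
hs = {random.getrandbits(64) for _ in range(10_000)}
print(kmv_cardinality(hs))
```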
```diff
@@ -979,8 +1316,9 @@ class SimpleDataset(Dataset):
             import pyarrow as pa
             import pyarrow.parquet as pq
 
-
-
+            # the manifest is a parquet file, read into a pyarrow Table
+            prev_manifest = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_manifest.to_pylist()
         except Exception:
             prev_rows = []
 
```
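The hunk above makes explicit that the manifest itself is a Parquet file: `refresh_manifest` reads the previous manifest bytes into a pyarrow Table and works on its rows. A minimal round trip of manifest entries through Parquet bytes, using a hypothetical subset of the entry fields seen elsewhere in this diff:

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical entries with an illustrative subset of the manifest fields.
entries = [
    {"file_path": "data/0001.parquet", "record_count": 100, "file_size_in_bytes": 2048},
    {"file_path": "data/0002.parquet", "record_count": 250, "file_size_in_bytes": 4096},
]

# Write the manifest to bytes...
buf = pa.BufferOutputStream()
pq.write_table(pa.Table.from_pylist(entries), buf)
data = buf.getvalue().to_pybytes()

# ...and read it back the way refresh_manifest does.
prev_manifest = pq.read_table(pa.BufferReader(data))
prev_rows = prev_manifest.to_pylist()
assert prev_rows == entries
```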
```diff
@@ -1000,8 +1338,8 @@ class SimpleDataset(Dataset):
                 with inp.open() as f:
                     data = f.read()
                 # Full statistics including histograms and k-hashes
-
-                manifest_entry =
+                file_size = len(data)
+                manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
                 dent = manifest_entry.to_dict()
             except Exception:
                 # Fall back to original entry if re-read fails
```
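Finally, the distribution logic in `describe()` merges per-file fixed-width histograms into one global histogram: each file-local bin is reduced to a representative value (the bin midpoint) weighted by its count, the primary path feeds those into the vendored maki_nage `Distogram`, and the fallback re-projects them directly onto a global grid. A self-contained sketch of that fallback re-binning:

```python
def rebin(file_hists, global_min, global_max, bins=10):
    """Re-project per-file histograms onto a global [min, max] grid (sketch)."""
    distribution = [0] * bins
    gspan = float(global_max - global_min)
    for fmin, fmax, counts in file_hists:
        fbins = len(counts)
        if fbins <= 0:
            continue
        for bi, cnt in enumerate(counts):
            if cnt <= 0:
                continue
            # Representative value: the midpoint of this file-local bin.
            rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
            gi = min(max(int((rep - global_min) / gspan * bins), 0), bins - 1)
            distribution[gi] += int(cnt)
    return distribution

# Two files covering different halves of [0, 100].
hists = [(0.0, 50.0, [5, 5, 5, 5, 5]), (50.0, 100.0, [2, 2, 2, 2, 2])]
print(rebin(hists, 0.0, 100.0))  # coarse global 10-bin distribution
```

The Distogram path presumably blends overlapping file ranges more smoothly; the direct re-binning above is the coarser safety net the code keeps for when that import or merge fails.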