opteryx-catalog 0.4.8__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of opteryx-catalog might be problematic.

@@ -0,0 +1,289 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from dataclasses import field
+ from typing import Any
+ from typing import Dict
+
+ NULL_FLAG = -(1 << 63)
+ MIN_K_HASHES = 32
+ HISTOGRAM_BINS = 32
+
+
+ @dataclass
+ class DataFile:
+     file_path: str
+     file_format: str = "PARQUET"
+     record_count: int = 0
+     file_size_in_bytes: int = 0
+     partition: Dict[str, object] = field(default_factory=dict)
+     lower_bounds: Dict[int, bytes] | None = None
+     upper_bounds: Dict[int, bytes] | None = None
+
+
+ @dataclass
+ class ManifestEntry:
+     snapshot_id: int
+     data_file: DataFile
+     status: str = "added"  # 'added' | 'deleted'
+
+
+ @dataclass
+ class ParquetManifestEntry:
+     """Represents a single entry in a Parquet manifest with statistics."""
+
+     file_path: str
+     file_format: str
+     record_count: int
+     file_size_in_bytes: int
+     uncompressed_size_in_bytes: int
+     min_k_hashes: list[list[int]]
+     histogram_counts: list[list[int]]
+     histogram_bins: int
+     min_values: list
+     max_values: list
+
+     def to_dict(self) -> dict:
+         return {
+             "file_path": self.file_path,
+             "file_format": self.file_format,
+             "record_count": self.record_count,
+             "file_size_in_bytes": self.file_size_in_bytes,
+             "uncompressed_size_in_bytes": self.uncompressed_size_in_bytes,
+             "min_k_hashes": self.min_k_hashes,
+             "histogram_counts": self.histogram_counts,
+             "histogram_bins": self.histogram_bins,
+             "min_values": self.min_values,
+             "max_values": self.max_values,
+         }
+
+
+ def build_parquet_manifest_entry(
+     table: Any, file_path: str, file_size_in_bytes: int
+ ) -> ParquetManifestEntry:
+     """Build a Parquet manifest entry with statistics for a PyArrow table.
+
+     Args:
+         table: PyArrow table to analyze
+         file_path: Path where the file is stored
+         file_size_in_bytes: Size of the parquet file in bytes
+
+     Returns:
+         ParquetManifestEntry with computed statistics
+     """
+     import pyarrow as pa
+
+     min_k_hashes: list[list[int]] = []
+     histograms: list[list[int]] = []
+     min_values: list[int] = []
+     max_values: list[int] = []
+
+     # Use draken for efficient hashing and compression when available.
+     import heapq
+
+     try:
+         import opteryx.draken as draken  # type: ignore
+
+         for col_idx, col in enumerate(table.columns):
+             # hash column values to 64-bit via draken (new cpdef API)
+             vec = draken.Vector.from_arrow(col)
+             hashes = list(vec.hash())
+
+             # Decide whether to compute min-k/histogram for this column based
+             # on field type and, for strings, average length of values.
+             field_type = table.schema.field(col_idx).type
+             compute_min_k = False
+             if (
+                 pa.types.is_integer(field_type)
+                 or pa.types.is_floating(field_type)
+                 or pa.types.is_decimal(field_type)
+             ):
+                 compute_min_k = True
+             elif (
+                 pa.types.is_timestamp(field_type)
+                 or pa.types.is_date(field_type)
+                 or pa.types.is_time(field_type)
+             ):
+                 compute_min_k = True
+             elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+                 # compute average length from non-null values; only allow
+                 # min-k/histogram for short strings (avg <= 16)
+                 col_py = None
+                 try:
+                     col_py = col.to_pylist()
+                 except Exception:
+                     col_py = None
+
+                 if col_py is not None:
+                     lens = [len(x) for x in col_py if x is not None]
+                     if lens:
+                         avg_len = sum(lens) / len(lens)
+                         if avg_len <= 16:
+                             compute_min_k = True
+
+             # KMV: take K smallest hashes when allowed; otherwise store an
+             # empty list for this column.
+             if compute_min_k:
+                 smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+                 col_min_k = sorted(smallest)
+             else:
+                 col_min_k = []
+
+             # For histogram decisions follow the same rule as min-k
+             compute_hist = compute_min_k
+
+             # Use draken.compress() to get canonical int64 per value
+             mapped = list(vec.compress())
+             non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
+             if non_nulls_mapped:
+                 vmin = min(non_nulls_mapped)
+                 vmax = max(non_nulls_mapped)
+                 col_min = int(vmin)
+                 col_max = int(vmax)
+                 if compute_hist:
+                     if vmin == vmax:
+                         col_hist = [0] * HISTOGRAM_BINS
+                         col_hist[-1] = len(non_nulls_mapped)
+                     else:
+                         col_hist = [0] * HISTOGRAM_BINS
+                         span = float(vmax - vmin)
+                         for m in non_nulls_mapped:
+                             b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                             if b < 0:
+                                 b = 0
+                             if b >= HISTOGRAM_BINS:
+                                 b = HISTOGRAM_BINS - 1
+                             col_hist[b] += 1
+                 else:
+                     col_hist = [0] * HISTOGRAM_BINS
+             else:
+                 # no non-null values; histogram via hash buckets
+                 col_min = NULL_FLAG
+                 col_max = NULL_FLAG
+                 if compute_hist:
+                     col_hist = [0] * HISTOGRAM_BINS
+                     for h in hashes:
+                         b = (h >> (64 - 5)) & 0x1F
+                         col_hist[b] += 1
+                 else:
+                     col_hist = [0] * HISTOGRAM_BINS
+
+             min_k_hashes.append(col_min_k)
+             histograms.append(col_hist)
+             min_values.append(col_min)
+             max_values.append(col_max)
+     except Exception as exc:
+         print(f"Failed to build full manifest entry: {file_path} - {exc}")
+         return build_parquet_manifest_minmax_entry(table, file_path)
+
+     # Calculate uncompressed size from table
+     uncompressed_size = 0
+     try:
+         for col in table.columns:
+             # Estimate uncompressed size by summing the size of all columns in memory
+             for chunk in col.chunks:
+                 for buffer in chunk.buffers():
+                     if buffer is not None:
+                         uncompressed_size += buffer.size
+     except Exception:
+         # Fallback: use compressed size if calculation fails
+         uncompressed_size = file_size_in_bytes
+
+     return ParquetManifestEntry(
+         file_path=file_path,
+         file_format="parquet",
+         record_count=int(table.num_rows),
+         file_size_in_bytes=file_size_in_bytes,
+         uncompressed_size_in_bytes=uncompressed_size,
+         min_k_hashes=min_k_hashes,
+         histogram_counts=histograms,
+         histogram_bins=HISTOGRAM_BINS,
+         min_values=min_values,
+         max_values=max_values,
+     )
+
+
+ def build_parquet_manifest_minmax_entry(data: bytes, file_path: str) -> ParquetManifestEntry:
+     """Build a Parquet manifest entry with min/max statistics using fast rugo reader.
+
+     This is much faster than build_parquet_manifest_entry (microseconds per file)
+     and is suitable for bulk file operations where full statistics are not needed.
+
+     Args:
+         data: Raw parquet file bytes
+         file_path: Path where the file is stored
+
+     Returns:
+         ParquetManifestEntry with min/max statistics only (no histograms or k-hashes)
+     """
+     import opteryx.rugo.parquet as parquet_meta
+     from opteryx.compiled.structures.relation_statistics import to_int
+
+     file_size = len(data)
+
+     # Use rugo's fast metadata reader
+     if isinstance(data, memoryview):
+         metadata = parquet_meta.read_metadata_from_memoryview(data, include_statistics=True)
+     else:
+         metadata = parquet_meta.read_metadata_from_memoryview(
+             memoryview(data), include_statistics=True
+         )
+
+     record_count = metadata["num_rows"]
+
+     # Gather min/max per column across all row groups
+     column_stats = {}
+     for row_group in metadata["row_groups"]:
+         for column in row_group["columns"]:
+             column_name = column["name"]
+
+             if column_name not in column_stats:
+                 column_stats[column_name] = {"min": None, "max": None}
+
+             min_value = column.get("min")
+             if min_value is not None:
+                 # Compress value to int using to_int
+                 min_compressed = to_int(min_value)
+                 if column_stats[column_name]["min"] is None:
+                     column_stats[column_name]["min"] = min_compressed
+                 else:
+                     column_stats[column_name]["min"] = min(
+                         column_stats[column_name]["min"], min_compressed
+                     )
+
+             max_value = column.get("max")
+             if max_value is not None:
+                 # Compress value to int using to_int
+                 max_compressed = to_int(max_value)
+                 if column_stats[column_name]["max"] is None:
+                     column_stats[column_name]["max"] = max_compressed
+                 else:
+                     column_stats[column_name]["max"] = max(
+                         column_stats[column_name]["max"], max_compressed
+                     )
+
+     # Extract min/max values (filter out None)
+     min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
+     max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
+
+     # Get uncompressed size from metadata
+     uncompressed_size = 0
+     try:
+         for row_group in metadata["row_groups"]:
+             uncompressed_size += row_group.get("total_byte_size", 0)
+     except Exception:
+         # Fallback to compressed size
+         uncompressed_size = file_size
+
+     return ParquetManifestEntry(
+         file_path=file_path,
+         file_format="parquet",
+         record_count=int(record_count),
+         file_size_in_bytes=file_size,
+         uncompressed_size_in_bytes=uncompressed_size,
+         min_k_hashes=[],
+         histogram_counts=[],
+         histogram_bins=0,
+         min_values=min_values,
+         max_values=max_values,
+     )
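For readers following the statistics logic above: the 32-bin histogram maps each draken-compressed int64 value onto a bin with a simple linear rule. The sketch below restates that rule in isolation; `values` is a hypothetical list of already-compressed integers and the helper name is invented for illustration, not part of the package.

# Illustrative sketch of the 32-bin bucketing rule; not package code.
HISTOGRAM_BINS = 32

def bucket_counts(values: list[int]) -> list[int]:
    hist = [0] * HISTOGRAM_BINS
    if not values:
        return hist
    vmin, vmax = min(values), max(values)
    if vmin == vmax:
        hist[-1] = len(values)  # degenerate range: everything lands in the last bin
        return hist
    span = float(vmax - vmin)
    for v in values:
        b = int(((float(v) - vmin) / span) * (HISTOGRAM_BINS - 1))
        hist[max(0, min(b, HISTOGRAM_BINS - 1))] += 1
    return hist

# e.g. bucket_counts([1, 2, 3, 100]): the three small values fall into bin 0,
# the maximum into bin 31.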
@@ -0,0 +1,81 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from dataclasses import field
+ from typing import Any
+ from typing import List
+ from typing import Optional
+
+
+ @dataclass
+ class Snapshot:
+     snapshot_id: int
+     timestamp_ms: int
+     author: Optional[str] = None
+     # Indicates whether this snapshot was created by a user (True) or internally (False)
+     user_created: Optional[bool] = None
+     # Monotonic sequence number for writes
+     sequence_number: Optional[int] = None
+     manifest_list: Optional[str] = None
+     # Operation metadata
+     operation_type: Optional[str] = None  # e.g., 'append', 'overwrite', 'compact'
+     parent_snapshot_id: Optional[int] = None
+     schema_id: Optional[str] = None
+     # Commit message for the snapshot
+     commit_message: Optional[str] = None
+     # Summary metrics (store zeros when not applicable)
+     summary: dict = field(
+         default_factory=lambda: {
+             "added-data-files": 0,
+             "added-files-size": 0,
+             "added-records": 0,
+             "deleted-data-files": 0,
+             "deleted-files-size": 0,
+             "deleted-records": 0,
+             "total-data-files": 0,
+             "total-files-size": 0,
+             "total-records": 0,
+         }
+     )
+
+
+ @dataclass
+ class DatasetMetadata:
+     dataset_identifier: str
+     format_version: int = 2
+     location: str = ""
+     schema: Any = None
+     properties: dict = field(default_factory=dict)
+     # Dataset-level created/updated metadata
+     timestamp_ms: Optional[int] = None
+     author: Optional[str] = None
+     description: Optional[str] = None
+     describer: Optional[str] = None
+     sort_orders: List[int] = field(default_factory=list)
+     # Maintenance policy: retention settings grouped under a single block
+     maintenance_policy: dict = field(
+         default_factory=lambda: {
+             "retained-snapshot-count": None,
+             "retained-snapshot-age-days": None,
+             "compaction-policy": "performance",
+         }
+     )
+     # Compaction policy lives under maintenance_policy as 'compaction-policy'
+     snapshots: List[Snapshot] = field(default_factory=list)
+     current_snapshot_id: Optional[int] = None
+     # Schema management: schemas are stored in a subcollection in Firestore.
+     # `schemas` contains dicts with keys: schema_id, columns (list of {id,name,type}).
+     # Each schema dict may also include `timestamp-ms` and `author`.
+     schemas: List[dict] = field(default_factory=list)
+     current_schema_id: Optional[str] = None
+
+     def current_snapshot(self) -> Optional[Snapshot]:
+         if self.current_snapshot_id is None:
+             return self.snapshots[-1] if self.snapshots else None
+         for s in self.snapshots:
+             if s.snapshot_id == self.current_snapshot_id:
+                 return s
+         return None
+
+
+ # Dataset terminology: TableMetadata renamed to DatasetMetadata
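A brief usage sketch of the dataclasses above, assuming only what they define; the dataset identifier and timestamp are invented. It shows how `current_snapshot()` falls back to the latest snapshot when `current_snapshot_id` is unset.

# Illustration only: building metadata in memory and resolving the current snapshot.
import time

snap = Snapshot(snapshot_id=1, timestamp_ms=int(time.time() * 1000), operation_type="append")
meta = DatasetMetadata(dataset_identifier="workspace.collection.example", snapshots=[snap])

assert meta.current_snapshot() is snap  # no current_snapshot_id set: latest snapshot wins
meta.current_snapshot_id = 1
assert meta.current_snapshot().snapshot_id == 1  # explicit id is matched by lookup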
@@ -0,0 +1,68 @@
+ from __future__ import annotations
+
+ from typing import Any
+ from typing import Iterable
+ from typing import Optional
+
+
+ class Metastore:
+     """Abstract catalog interface.
+
+     Implementations should provide methods to create, load and manage
+     datasets and views. Terminology in this project follows the mapping:
+     `catalog -> workspace -> collection -> dataset|view`.
+     Signatures are intentionally simple and similar to other catalog
+     implementations to ease future compatibility.
+     """
+
+     def load_dataset(self, identifier: str) -> "Dataset":
+         raise NotImplementedError()
+
+     def create_dataset(
+         self, identifier: str, schema: Any, properties: dict | None = None
+     ) -> "Dataset":
+         raise NotImplementedError()
+
+     def drop_dataset(self, identifier: str) -> None:
+         raise NotImplementedError()
+
+     def list_datasets(self, namespace: str) -> Iterable[str]:
+         raise NotImplementedError()
+
+
+ class Dataset:
+     """Abstract dataset interface.
+
+     Minimal methods needed by the Opteryx engine and tests: access metadata,
+     list snapshots, append data, and produce a data scan object.
+     """
+
+     @property
+     def metadata(self) -> Any:
+         raise NotImplementedError()
+
+     def snapshots(self) -> Iterable[Any]:
+         raise NotImplementedError()
+
+     def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Any]:
+         """Return a specific snapshot by id or the current snapshot when
+         called with `snapshot_id=None`.
+         """
+         raise NotImplementedError()
+
+     def append(self, table):
+         """Append data (implementations can accept pyarrow.Table or similar)."""
+         raise NotImplementedError()
+
+     def scan(
+         self, row_filter=None, snapshot_id: Optional[int] = None, row_limit: Optional[int] = None
+     ) -> Any:
+         raise NotImplementedError()
+
+
+ class View:
+     """Abstract view metadata representation."""
+
+     @property
+     def definition(self) -> str:
+         raise NotImplementedError()
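To illustrate the surface these abstract classes expect, here is a minimal in-memory `Metastore` sketch. It is not part of the package (the concrete catalog in this release appears to be Firestore/GCS oriented); everything beyond the abstract signatures is an assumption made for the example.

# Hypothetical in-memory implementation, for illustration only.
class InMemoryMetastore(Metastore):
    def __init__(self):
        self._datasets: dict[str, Dataset] = {}

    def load_dataset(self, identifier: str) -> Dataset:
        return self._datasets[identifier]

    def create_dataset(self, identifier: str, schema, properties=None) -> Dataset:
        dataset = Dataset()  # a real implementation would return a concrete subclass
        self._datasets[identifier] = dataset
        return dataset

    def drop_dataset(self, identifier: str) -> None:
        self._datasets.pop(identifier, None)

    def list_datasets(self, namespace: str):
        # treat the namespace as an identifier prefix for this toy example
        return [name for name in self._datasets if name.startswith(namespace)]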
@@ -0,0 +1,12 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any
+
+
+ @dataclass
+ class View:
+     name: str
+     definition: str
+     properties: dict | None = None
+     metadata: Any | None = None
@@ -0,0 +1,38 @@
+ """Catalog-specific exceptions for opteryx_catalog.
+
+ Exceptions mirror previous behavior (they subclass KeyError where callers
+ may expect KeyError) but provide explicit types for datasets, views and
+ namespaces.
+ """
+
+
+ class CatalogError(Exception):
+     """Base class for catalog errors."""
+
+
+ class DatasetError(KeyError, CatalogError):
+     pass
+
+
+ class DatasetAlreadyExists(DatasetError):
+     pass
+
+
+ class DatasetNotFound(DatasetError):
+     pass
+
+
+ class ViewError(KeyError, CatalogError):
+     pass
+
+
+ class ViewAlreadyExists(ViewError):
+     pass
+
+
+ class ViewNotFound(ViewError):
+     pass
+
+
+ class CollectionAlreadyExists(KeyError, CatalogError):
+     pass
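Because the dataset and view exceptions subclass `KeyError` as well as `CatalogError`, callers written against the previous behaviour keep working while newer code can catch the explicit types. A small sketch (the identifier string is invented):

# Both handlers below catch the same error class.
try:
    raise DatasetNotFound("workspace.collection.missing")
except KeyError:
    pass  # legacy handling still works

try:
    raise DatasetNotFound("workspace.collection.missing")
except CatalogError:
    pass  # newer, more explicit handling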
@@ -0,0 +1,6 @@
+ from .base import FileIO
+ from .base import InputFile
+ from .base import OutputFile
+ from .gcs import GcsFileIO
+
+ __all__ = ["FileIO", "InputFile", "OutputFile", "GcsFileIO"]
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ from io import BytesIO
+ from typing import BinaryIO
+
+
+ class InputFile:
+     def __init__(self, location: str, content: bytes | None = None):
+         self.location = location
+         self._content = content
+
+     def open(self) -> BinaryIO:
+         if self._content is None:
+             raise FileNotFoundError(self.location)
+         return BytesIO(self._content)
+
+
+ class OutputFile:
+     def __init__(self, location: str):
+         self.location = location
+
+     def create(self):
+         """Return a file-like object with a `write` method.
+
+         Implementations may return a buffer or a writer that persists on write/close.
+         """
+         raise NotImplementedError()
+
+
+ class FileIO:
+     """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+     Concrete implementations should implement `new_input`, `new_output`, and
+     optionally `delete`/`exists`. The abstraction intentionally keeps only the
+     small surface needed by the catalog (read bytes, write bytes).
+     """
+
+     def new_input(self, location: str) -> InputFile:
+         return InputFile(location)
+
+     def new_output(self, location: str) -> OutputFile:
+         return OutputFile(location)
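As an illustration of the intended surface, a hypothetical local-filesystem `FileIO` could look like the sketch below; it is not shipped by the package, which provides a GCS-backed implementation instead.

# Hypothetical local-disk FileIO, for illustration only.
import os

class LocalFileIO(FileIO):
    def new_input(self, location: str) -> InputFile:
        try:
            with open(location, "rb") as f:
                return InputFile(location, f.read())
        except FileNotFoundError:
            return InputFile(location, None)

    def new_output(self, location: str) -> OutputFile:
        class _LocalOutputFile(OutputFile):
            def create(self):
                # ensure the parent directory exists, then hand back a writable handle
                os.makedirs(os.path.dirname(location) or ".", exist_ok=True)
                return open(location, "wb")

        return _LocalOutputFile(location)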
@@ -0,0 +1,125 @@
+ from __future__ import annotations
+
+ from io import BytesIO
+ from typing import BinaryIO
+
+
+ class InputFile:
+     def __init__(self, location: str, content: bytes | None = None):
+         self.location = location
+         self._content = content
+
+     def open(self) -> BinaryIO:
+         if self._content is None:
+             raise FileNotFoundError(self.location)
+         return BytesIO(self._content)
+
+
+ class OutputFile:
+     def __init__(self, location: str):
+         self.location = location
+
+     def create(self):
+         """Return a file-like object with a `write` method.
+
+         Implementations may return a buffer or a writer that persists on write/close.
+         """
+         raise NotImplementedError()
+
+
+ class FileIO:
+     """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+     Concrete implementations should implement `new_input`, `new_output`, and
+     optionally `delete`/`exists`. The abstraction intentionally keeps only the
+     small surface needed by the catalog (read bytes, write bytes).
+     """
+
+     def new_input(self, location: str) -> InputFile:
+         return InputFile(location)
+
+     def new_output(self, location: str) -> OutputFile:
+         return OutputFile(location)
+
+
+ class _GcsAdapterOutputFile(OutputFile):
+     def __init__(self, location: str, gcs_fileio):
+         super().__init__(location)
+         self._location = location
+         self._gcs_fileio = gcs_fileio
+
+     def create(self):
+         """Return a writer whose `write(data)` uploads the data via the wrapped GCS FileIO.
+
+         We perform the upload on the first write and close the underlying stream
+         afterwards so callers that simply call `out.write(data)` (common pattern
+         in this codebase) will succeed.
+         """
+
+         class _Writer:
+             def __init__(self, location: str, gcs_fileio):
+                 self._location = location
+                 self._gcs_fileio = gcs_fileio
+                 self._stream = None
+
+             def write(self, data: bytes | bytearray):
+                 if self._stream is None:
+                     # Create underlying output stream (may be a GcsOutputStream,
+                     # DiscardOutputStream, or CaptureOutputStream depending on
+                     # the wrapped FileIO behaviour).
+                     out = self._gcs_fileio.new_output(self._location)
+                     self._stream = out.create()
+                 # Underlying stream implements write/close semantics
+                 self._stream.write(data)
+
+             def close(self):
+                 if self._stream is not None:
+                     try:
+                         self._stream.close()
+                     except Exception:
+                         pass
+
+         return _Writer(self._location, self._gcs_fileio)
+
+
+ class GcsFileIO(FileIO):
+     """GCS-backed FileIO adapter that wraps the existing GCS implementation.
+
+     This adapter delegates to `pyiceberg_firestore_gcs.fileio.gcs_fileio.GcsFileIO`
+     for actual network operations but exposes the small `opteryx_catalog.iops`
+     `FileIO` interface used by the catalog layer.
+     """
+
+     def __init__(self, properties=None):
+         # Lazy import to avoid pulling google libs unless used
+         from pyiceberg_firestore_gcs.fileio.gcs_fileio import GcsFileIO as _GcsImpl
+
+         self._impl = _GcsImpl(properties or {})
+
+     def new_input(self, location: str) -> InputFile:
+         # Read full bytes from the underlying InputFile and return an in-memory InputFile
+         impl_input = self._impl.new_input(location)
+         try:
+             stream = impl_input.open()
+             data = stream.read()
+             return InputFile(location, data)
+         except FileNotFoundError:
+             return InputFile(location, None)
+
+     def new_output(self, location: str) -> OutputFile:
+         return _GcsAdapterOutputFile(location, self._impl)
+
+     def delete(self, location: str) -> None:
+         return self._impl.delete(location)
+
+     def exists(self, location: str) -> bool:
+         try:
+             impl_in = self._impl.new_input(location)
+             # Some implementations provide `exists()`
+             if hasattr(impl_in, "exists"):
+                 return impl_in.exists()
+             # Fallback: try to open
+             _ = impl_in.open()
+             return True
+         except Exception:
+             return False
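A hedged usage sketch of the adapter above: the bucket, object path, and property dict are invented for the example, and valid Google Cloud credentials accepted by the wrapped `pyiceberg_firestore_gcs` implementation are required for this to actually run.

# Write bytes via the adapter's writer, then read them back; paths are made up.
fileio = GcsFileIO(properties={})

writer = fileio.new_output("gs://example-bucket/manifests/0001.json").create()
writer.write(b'{"entries": []}')
writer.close()

data = fileio.new_input("gs://example-bucket/manifests/0001.json").open().read()
assert data == b'{"entries": []}'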