opteryx_catalog-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+
+
+@dataclass
+class DataFile:
+    file_path: str
+    file_format: str = "PARQUET"
+    record_count: int = 0
+    file_size_in_bytes: int = 0
+    partition: Dict[str, object] = field(default_factory=dict)
+    lower_bounds: Dict[int, bytes] | None = None
+    upper_bounds: Dict[int, bytes] | None = None
+
+
+@dataclass
+class ManifestEntry:
+    snapshot_id: int
+    data_file: DataFile
+    status: str = "added"  # 'added' | 'deleted'
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+from typing import List
+from typing import Optional
+
+
+@dataclass
+class Snapshot:
+    snapshot_id: int
+    timestamp_ms: int
+    author: Optional[str] = None
+    # Indicates whether this snapshot was created by a user (True) or internally (False)
+    user_created: Optional[bool] = None
+    # Monotonic sequence number for writes
+    sequence_number: Optional[int] = None
+    manifest_list: Optional[str] = None
+    # Operation metadata
+    operation_type: Optional[str] = None  # e.g., 'append', 'overwrite', 'compact'
+    parent_snapshot_id: Optional[int] = None
+    schema_id: Optional[str] = None
+    # Commit message for the snapshot
+    commit_message: Optional[str] = None
+    # Summary metrics (store zeros when not applicable)
+    summary: dict = field(
+        default_factory=lambda: {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-records": 0,
+            "total-data-files": 0,
+            "total-files-size": 0,
+            "total-records": 0,
+        }
+    )
+
+
+@dataclass
+class DatasetMetadata:
+    dataset_identifier: str
+    format_version: int = 2
+    location: str = ""
+    schema: Any = None
+    properties: dict = field(default_factory=dict)
+    # Table-level created/updated metadata
+    timestamp_ms: Optional[int] = None
+    author: Optional[str] = None
+    description: Optional[str] = None
+    describer: Optional[str] = None
+    sort_orders: List[dict] = field(default_factory=list)
+    # Maintenance policy: retention settings grouped under a single block
+    maintenance_policy: dict = field(
+        default_factory=lambda: {
+            "retained-snapshot-count": None,
+            "retained-snapshot-age-days": None,
+            "compaction-policy": "performance",
+        }
+    )
+    # Compaction policy lives under maintenance_policy as 'compaction-policy'
+    snapshots: List[Snapshot] = field(default_factory=list)
+    current_snapshot_id: Optional[int] = None
+    # Schema management: schemas are stored in a subcollection in Firestore.
+    # `schemas` contains dicts with keys: schema_id, columns (list of {id,name,type}).
+    # Each schema dict may also include `timestamp-ms` and `author`.
+    schemas: List[dict] = field(default_factory=list)
+    current_schema_id: Optional[str] = None
+
+    def current_snapshot(self) -> Optional[Snapshot]:
+        if self.current_snapshot_id is None:
+            return self.snapshots[-1] if self.snapshots else None
+        for s in self.snapshots:
+            if s.snapshot_id == self.current_snapshot_id:
+                return s
+        return None
+
+
+# Dataset terminology: TableMetadata renamed to DatasetMetadata
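An illustrative sketch (not from the package) of how `current_snapshot()` resolves, assuming `Snapshot` and `DatasetMetadata` are imported from this module; identifiers and timestamps are invented:

    meta = DatasetMetadata(dataset_identifier="workspace.collection.example")
    meta.snapshots.append(Snapshot(snapshot_id=1, timestamp_ms=1_700_000_000_000))
    meta.snapshots.append(Snapshot(snapshot_id=2, timestamp_ms=1_700_000_100_000))

    assert meta.current_snapshot().snapshot_id == 2  # no current_snapshot_id: newest snapshot wins
    meta.current_snapshot_id = 1
    assert meta.current_snapshot().snapshot_id == 1  # an explicit id is looked up
    meta.current_snapshot_id = 99
    assert meta.current_snapshot() is None           # an unknown id resolves to None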
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from typing import Any
+from typing import Iterable
+from typing import Optional
+
+
+class Metastore:
+    """Abstract catalog interface.
+
+    Implementations should provide methods to create, load and manage
+    datasets and views. Terminology in this project follows the mapping:
+    `catalog -> workspace -> collection -> dataset|view`.
+    Signatures are intentionally simple and similar to other catalog
+    implementations to ease future compatibility.
+    """
+
+    def load_dataset(self, identifier: str) -> "Table":
+        raise NotImplementedError()
+
+    def create_dataset(
+        self, identifier: str, schema: Any, properties: dict | None = None
+    ) -> "Table":
+        raise NotImplementedError()
+
+    def drop_dataset(self, identifier: str) -> None:
+        raise NotImplementedError()
+
+    def list_datasets(self, namespace: str) -> Iterable[str]:
+        raise NotImplementedError()
+
+
+class Dataset:
+    """Abstract dataset interface.
+
+    Minimal methods needed by the Opteryx engine and tests: access metadata,
+    list snapshots, append data, and produce a data scan object.
+    """
+
+    @property
+    def metadata(self) -> Any:
+        raise NotImplementedError()
+
+    def snapshots(self) -> Iterable[Any]:
+        raise NotImplementedError()
+
+    def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Any]:
+        """Return a specific snapshot by id or the current snapshot when
+        called with `snapshot_id=None`.
+        """
+        raise NotImplementedError()
+
+    def append(self, table):
+        """Append data (implementations can accept pyarrow.Table or similar)."""
+        raise NotImplementedError()
+
+    def scan(
+        self, row_filter=None, snapshot_id: Optional[int] = None, row_limit: Optional[int] = None
+    ) -> Any:
+        raise NotImplementedError()
+
+
+class View:
+    """Abstract view metadata representation."""
+
+    @property
+    def definition(self) -> str:
+        raise NotImplementedError()
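A minimal in-memory sketch of the `Metastore` contract; everything other than the four interface methods is invented here, and the return values are simplified stand-ins rather than real dataset objects:

    class InMemoryMetastore(Metastore):
        def __init__(self):
            self._datasets = {}

        def create_dataset(self, identifier, schema, properties=None):
            if identifier in self._datasets:
                raise KeyError(identifier)
            self._datasets[identifier] = (schema, properties or {})
            return self._datasets[identifier]

        def load_dataset(self, identifier):
            return self._datasets[identifier]

        def drop_dataset(self, identifier):
            self._datasets.pop(identifier, None)

        def list_datasets(self, namespace):
            return [name for name in self._datasets if name.startswith(f"{namespace}.")]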
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class View:
+    name: str
+    definition: str
+    properties: dict | None = None
+    metadata: Any | None = None
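For illustration only, a view pairs a name with its definition (typically a SQL statement); the values below are invented:

    v = View(name="daily_totals", definition="SELECT day, SUM(amount) FROM sales GROUP BY day")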
@@ -0,0 +1,38 @@
+"""Catalog-specific exceptions for opteryx_catalog.
+
+Exceptions mirror previous behavior (they subclass KeyError where callers
+may expect KeyError) but provide explicit types for tables, views and
+namespaces.
+"""
+
+
+class CatalogError(Exception):
+    """Base class for catalog errors."""
+
+
+class DatasetError(KeyError, CatalogError):
+    pass
+
+
+class DatasetAlreadyExists(DatasetError):
+    pass
+
+
+class DatasetNotFound(DatasetError):
+    pass
+
+
+class ViewError(KeyError, CatalogError):
+    pass
+
+
+class ViewAlreadyExists(ViewError):
+    pass
+
+
+class ViewNotFound(ViewError):
+    pass
+
+
+class CollectionAlreadyExists(KeyError, CatalogError):
+    pass
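A short sketch of the compatibility the module docstring describes: because the dataset and view errors subclass `KeyError`, callers that still catch `KeyError` keep working (the identifier below is invented):

    try:
        raise DatasetNotFound("workspace.collection.missing")
    except KeyError as exc:
        # Older call sites that expect KeyError still handle the new exception type.
        print(f"dataset missing: {exc}")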
@@ -0,0 +1,6 @@
+from .base import FileIO
+from .base import InputFile
+from .base import OutputFile
+from .gcs import GcsFileIO
+
+__all__ = ["FileIO", "InputFile", "OutputFile", "GcsFileIO"]
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from io import BytesIO
+from typing import BinaryIO
+
+
+class InputFile:
+    def __init__(self, location: str, content: bytes | None = None):
+        self.location = location
+        self._content = content
+
+    def open(self) -> BinaryIO:
+        if self._content is None:
+            raise FileNotFoundError(self.location)
+        return BytesIO(self._content)
+
+
+class OutputFile:
+    def __init__(self, location: str):
+        self.location = location
+
+    def create(self):
+        """Return a file-like object with a `write` method.
+
+        Implementations may return a buffer or a writer that persists on write/close.
+        """
+        raise NotImplementedError()
+
+
+class FileIO:
+    """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+    Concrete implementations should implement `new_input`, `new_output`, and
+    optionally `delete`/`exists`. The abstraction intentionally keeps only the
+    small surface needed by the catalog (read bytes, write bytes).
+    """
+
+    def new_input(self, location: str) -> InputFile:
+        return InputFile(location)
+
+    def new_output(self, location: str) -> OutputFile:
+        return OutputFile(location)
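A minimal in-memory `FileIO` sketch illustrating the read-bytes/write-bytes surface the docstring describes; the class, its writer, and the backing dict are invented for illustration and assume `FileIO`, `InputFile`, and `OutputFile` are importable from this module:

    from io import BytesIO


    class _MemoryWriter(BytesIO):
        def __init__(self, store: dict, location: str):
            super().__init__()
            self._store = store
            self._location = location

        def close(self):
            # Persist the buffered bytes into the dict when the writer is closed.
            self._store[self._location] = self.getvalue()
            super().close()


    class MemoryFileIO(FileIO):
        def __init__(self):
            self._store: dict[str, bytes] = {}

        def new_input(self, location: str) -> InputFile:
            # Missing keys yield an InputFile with no content, so open() raises FileNotFoundError.
            return InputFile(location, self._store.get(location))

        def new_output(self, location: str) -> OutputFile:
            outer = self

            class _MemoryOutputFile(OutputFile):
                def create(self):
                    return _MemoryWriter(outer._store, self.location)

            return _MemoryOutputFile(location)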
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from io import BytesIO
+from typing import BinaryIO
+
+
+class InputFile:
+    def __init__(self, location: str, content: bytes | None = None):
+        self.location = location
+        self._content = content
+
+    def open(self) -> BinaryIO:
+        if self._content is None:
+            raise FileNotFoundError(self.location)
+        return BytesIO(self._content)
+
+
+class OutputFile:
+    def __init__(self, location: str):
+        self.location = location
+
+    def create(self):
+        """Return a file-like object with a `write` method.
+
+        Implementations may return a buffer or a writer that persists on write/close.
+        """
+        raise NotImplementedError()
+
+
+class FileIO:
+    """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+    Concrete implementations should implement `new_input`, `new_output`, and
+    optionally `delete`/`exists`. The abstraction intentionally keeps only the
+    small surface needed by the catalog (read bytes, write bytes).
+    """
+
+    def new_input(self, location: str) -> InputFile:
+        return InputFile(location)
+
+    def new_output(self, location: str) -> OutputFile:
+        return OutputFile(location)
+
+
+class _GcsAdapterOutputFile(OutputFile):
+    def __init__(self, location: str, gcs_fileio):
+        super().__init__(location)
+        self._location = location
+        self._gcs_fileio = gcs_fileio
+
+    def create(self):
+        """Return a writer whose `write(data)` uploads the data via the wrapped GCS FileIO.
+
+        We perform the upload on the first write and close the underlying stream
+        afterwards so callers that simply call `out.write(data)` (common pattern
+        in this codebase) will succeed.
+        """
+
+        class _Writer:
+            def __init__(self, location: str, gcs_fileio):
+                self._location = location
+                self._gcs_fileio = gcs_fileio
+                self._stream = None
+
+            def write(self, data: bytes | bytearray):
+                if self._stream is None:
+                    # Create underlying output stream (may be a GcsOutputStream,
+                    # DiscardOutputStream, or CaptureOutputStream depending on
+                    # the wrapped FileIO behaviour).
+                    out = self._gcs_fileio.new_output(self._location)
+                    self._stream = out.create()
+                # Underlying stream implements write/close semantics
+                self._stream.write(data)
+
+            def close(self):
+                if self._stream is not None:
+                    try:
+                        self._stream.close()
+                    except Exception:
+                        pass
+
+        return _Writer(self._location, self._gcs_fileio)
+
+
+class GcsFileIO(FileIO):
+    """GCS-backed FileIO adapter that wraps the existing GCS implementation.
+
+    This adapter delegates to `pyiceberg_firestore_gcs.fileio.gcs_fileio.GcsFileIO`
+    for actual network operations but exposes the small `opteryx_catalog.iops`
+    `FileIO` interface used by the catalog layer.
+    """
+
+    def __init__(self, properties=None):
+        # Lazy import to avoid pulling google libs unless used
+        from pyiceberg_firestore_gcs.fileio.gcs_fileio import GcsFileIO as _GcsImpl
+
+        self._impl = _GcsImpl(properties or {})
+
+    def new_input(self, location: str) -> InputFile:
+        # Read full bytes from the underlying InputFile and return an in-memory InputFile
+        impl_input = self._impl.new_input(location)
+        try:
+            stream = impl_input.open()
+            data = stream.read()
+            return InputFile(location, data)
+        except FileNotFoundError:
+            return InputFile(location, None)
+
+    def new_output(self, location: str) -> OutputFile:
+        return _GcsAdapterOutputFile(location, self._impl)
+
+    def delete(self, location: str) -> None:
+        return self._impl.delete(location)
+
+    def exists(self, location: str) -> bool:
+        try:
+            impl_in = self._impl.new_input(location)
+            # Some implementations provide `exists()`
+            if hasattr(impl_in, "exists"):
+                return impl_in.exists()
+            # Fallback: try to open
+            _ = impl_in.open()
+            return True
+        except Exception:
+            return False
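A hedged usage sketch of the adapter's write path, as the docstring describes it; the bucket and object names are invented, and running it requires the `pyiceberg_firestore_gcs` package plus GCS credentials:

    fileio = GcsFileIO()

    writer = fileio.new_output("gs://example-bucket/metadata/00001.json").create()
    writer.write(b'{"format-version": 2}')  # first write opens the wrapped stream and hands it the bytes
    writer.close()                          # the wrapped stream persists on write/close

    payload = fileio.new_input("gs://example-bucket/metadata/00001.json").open().read()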
@@ -0,0 +1,225 @@
+"""
+Optimized GCS FileIO for opteryx_catalog.iops
+
+Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
+HTTP-backed GCS implementation without depending on pyiceberg types.
+"""
+
+import io
+import logging
+import os
+import urllib.parse
+from typing import Callable
+from typing import Union
+
+import requests
+from google.auth.transport.requests import Request
+from requests.adapters import HTTPAdapter
+
+from .base import FileIO
+from .base import InputFile
+from .base import OutputFile
+
+logger = logging.getLogger(__name__)
+
+
+def _get_storage_credentials():
+    from google.cloud import storage
+
+    if os.environ.get("STORAGE_EMULATOR_HOST"):
+        from google.auth.credentials import AnonymousCredentials
+
+        storage_client = storage.Client(credentials=AnonymousCredentials())
+    else:
+        storage_client = storage.Client()
+    return storage_client._credentials
+
+
+class _GcsInputStream(io.BytesIO):
+    def __init__(
+        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        # Strip gs://
+        if path.startswith("gs://"):
+            path = path[5:]
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"
+
+        access_token = access_token_getter()
+        headers = {"Accept-Encoding": "identity"}
+        if access_token:
+            headers["Authorization"] = f"Bearer {access_token}"
+
+        response = session.get(
+            url,
+            headers=headers,
+            timeout=30,
+        )
+
+        if response.status_code != 200:
+            raise FileNotFoundError(
+                f"Unable to read '{path}' - status {response.status_code}: {response.text}"
+            )
+
+        super().__init__(response.content)
+
+
+class _GcsOutputStream(io.BytesIO):
+    def __init__(
+        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        super().__init__()
+        self._path = path
+        self._session = session
+        self._access_token_getter = access_token_getter
+        self._closed = False
+
+    def close(self):
+        if self._closed:
+            return
+
+        path = self._path
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        url = f"https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o"
+
+        data = self.getvalue()
+        object_name = path[(len(bucket) + 1) :]
+
+        token = self._access_token_getter()
+        headers = {
+            "Content-Type": "application/octet-stream",
+            "Content-Length": str(len(data)),
+        }
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+
+        response = self._session.post(
+            url,
+            params={"uploadType": "media", "name": object_name},
+            headers=headers,
+            data=data,
+            timeout=60,
+        )
+
+        if response.status_code not in (200, 201):
+            raise IOError(
+                f"Failed to write '{self._path}' - status {response.status_code}: {response.text}"
+            )
+
+        self._closed = True
+        super().close()
+
+
+class _GcsInputFile(InputFile):
+    def __init__(
+        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        # read entire bytes via optimized session
+        try:
+            stream = _GcsInputStream(location, session, access_token_getter)
+            data = stream.read()
+            super().__init__(location, data)
+        except FileNotFoundError:
+            super().__init__(location, None)
+
+
+class _GcsOutputFile(OutputFile):
+    def __init__(
+        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        super().__init__(location)
+        self._location = location
+        self._session = session
+        self._access_token_getter = access_token_getter
+
+    def create(self):
+        return _GcsOutputStream(self._location, self._session, self._access_token_getter)
+
+
+class GcsFileIO(FileIO):
+    """Optimized HTTP-backed GCS FileIO.
+
+    Implements a blackhole/capture pattern for manifest files and exposes
+    `new_input`, `new_output`, `delete`, `exists`.
+    """
+
+    def __init__(self):
+        # Track manifest paths and captured manifests
+        self.manifest_paths: list[str] = []
+        self.captured_manifests: list[tuple[str, bytes]] = []
+
+        # Prepare requests session and set up credential refresh helper (token may expire)
+        self._credentials = _get_storage_credentials()
+        self._access_token = None
+
+        def _refresh_credentials():
+            try:
+                if not self._credentials.valid:
+                    req = Request()
+                    self._credentials.refresh(req)
+                self._access_token = self._credentials.token
+            except Exception as e:
+                logger.warning("Failed to refresh GCS credentials: %s", e)
+                self._access_token = None
+
+        self._refresh_credentials = _refresh_credentials
+
+        def get_access_token():
+            # Refresh credentials on demand to avoid using expired tokens
+            self._refresh_credentials()
+            return self._access_token
+
+        self.get_access_token = get_access_token
+
+        self._session = requests.session()
+        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
+        self._session.mount("https://", adapter)
+
+    def new_input(self, location: str) -> InputFile:
+        return _GcsInputFile(location, self._session, self.get_access_token)
+
+    def new_output(self, location: str) -> OutputFile:
+        logger.info(f"new_output -> {location}")
+
+        return _GcsOutputFile(location, self._session, self.get_access_token)
+
+    def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
+        if isinstance(location, (InputFile, OutputFile)):
+            location = location.location
+
+        path = location
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o/{object_full_path}"
+
+        token = self.get_access_token()
+        headers = {}
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        response = self._session.delete(url, headers=headers, timeout=10)
+
+        if response.status_code not in (204, 404):
+            raise IOError(f"Failed to delete '{location}' - status {response.status_code}")
+
+    def exists(self, location: str) -> bool:
+        path = location
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"
+
+        token = self.get_access_token()
+        headers = {}
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        response = self._session.head(url, headers=headers, timeout=10)
+        return response.status_code == 200
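A usage sketch of the HTTP-backed FileIO round trip; the bucket and object paths are invented, and it needs Google credentials or a STORAGE_EMULATOR_HOST to run:

    fileio = GcsFileIO()

    out = fileio.new_output("gs://example-bucket/datasets/example/manifest.bin").create()
    out.write(b"example manifest bytes")  # buffered in memory by _GcsOutputStream
    out.close()                           # single media upload happens on close

    if fileio.exists("gs://example-bucket/datasets/example/manifest.bin"):
        data = fileio.new_input("gs://example-bucket/datasets/example/manifest.bin").open().read()

    fileio.delete("gs://example-bucket/datasets/example/manifest.bin")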