opteryx-catalog 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opteryx-catalog might be problematic. Click here for more details.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +4 -0
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +1199 -0
- opteryx_catalog/catalog/manifest.py +289 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +255 -0
- opteryx_catalog/opteryx_catalog.py +857 -0
- opteryx_catalog-0.4.8.dist-info/METADATA +464 -0
- opteryx_catalog-0.4.8.dist-info/RECORD +25 -0
- opteryx_catalog-0.4.8.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.8.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.8.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +29 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimized GCS FileIO for opteryx_catalog.iops
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import urllib.parse
|
|
9
|
+
from collections import OrderedDict
|
|
10
|
+
from typing import Callable
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
from google.auth.transport.requests import Request
|
|
15
|
+
from requests.adapters import HTTPAdapter
|
|
16
|
+
|
|
17
|
+
from .base import FileIO
|
|
18
|
+
from .base import InputFile
|
|
19
|
+
from .base import OutputFile
|
|
20
|
+
|
|
21
|
+
# Upper bound on the number of recently read files kept in the local read cache
|
|
22
|
+
MAX_CACHE_SIZE: int = 32
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_storage_credentials():
    """Return the credentials object held by a newly created GCS client.

    When the ``STORAGE_EMULATOR_HOST`` environment variable is set to a
    non-empty value, the client is built with anonymous credentials;
    otherwise it is built without explicit credentials.
    """
    # Imported at call time rather than at module import.
    from google.cloud import storage

    emulator_host = os.environ.get("STORAGE_EMULATOR_HOST")
    if not emulator_host:
        return storage.Client()._credentials

    from google.auth.credentials import AnonymousCredentials

    client_kwargs = {"credentials": AnonymousCredentials()}
    return storage.Client(**client_kwargs)._credentials
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class _GcsInputStream(io.BytesIO):
|
|
40
|
+
def __init__(
|
|
41
|
+
self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
|
|
42
|
+
):
|
|
43
|
+
# Strip gs://
|
|
44
|
+
if path.startswith("gs://"):
|
|
45
|
+
path = path[5:]
|
|
46
|
+
bucket = path.split("/", 1)[0]
|
|
47
|
+
object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
|
|
48
|
+
url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"
|
|
49
|
+
|
|
50
|
+
access_token = access_token_getter()
|
|
51
|
+
headers = {"Accept-Encoding": "identity"}
|
|
52
|
+
if access_token:
|
|
53
|
+
headers["Authorization"] = f"Bearer {access_token}"
|
|
54
|
+
|
|
55
|
+
response = session.get(
|
|
56
|
+
url,
|
|
57
|
+
headers=headers,
|
|
58
|
+
timeout=30,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
if response.status_code != 200:
|
|
62
|
+
raise FileNotFoundError(
|
|
63
|
+
f"Unable to read '{path}' - status {response.status_code}: {response.text}"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
super().__init__(response.content)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class _GcsOutputStream(io.BytesIO):
|
|
70
|
+
def __init__(
|
|
71
|
+
self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
|
|
72
|
+
):
|
|
73
|
+
super().__init__()
|
|
74
|
+
self._path = path
|
|
75
|
+
self._session = session
|
|
76
|
+
self._access_token_getter = access_token_getter
|
|
77
|
+
self._closed = False
|
|
78
|
+
|
|
79
|
+
def close(self):
|
|
80
|
+
if self._closed:
|
|
81
|
+
return
|
|
82
|
+
|
|
83
|
+
path = self._path
|
|
84
|
+
if path.startswith("gs://"):
|
|
85
|
+
path = path[5:]
|
|
86
|
+
|
|
87
|
+
bucket = path.split("/", 1)[0]
|
|
88
|
+
url = f"https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o"
|
|
89
|
+
|
|
90
|
+
data = self.getvalue()
|
|
91
|
+
object_name = path[(len(bucket) + 1) :]
|
|
92
|
+
|
|
93
|
+
token = self._access_token_getter()
|
|
94
|
+
headers = {
|
|
95
|
+
"Content-Type": "application/octet-stream",
|
|
96
|
+
"Content-Length": str(len(data)),
|
|
97
|
+
}
|
|
98
|
+
if token:
|
|
99
|
+
headers["Authorization"] = f"Bearer {token}"
|
|
100
|
+
|
|
101
|
+
response = self._session.post(
|
|
102
|
+
url,
|
|
103
|
+
params={"uploadType": "media", "name": object_name},
|
|
104
|
+
headers=headers,
|
|
105
|
+
data=data,
|
|
106
|
+
timeout=60,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
if response.status_code not in (200, 201):
|
|
110
|
+
raise IOError(
|
|
111
|
+
f"Failed to write '{self._path}' - status {response.status_code}: {response.text}"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
self._closed = True
|
|
115
|
+
super().close()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class _GcsInputFile(InputFile):
    """Input file whose bytes come from GCS or from a shared read cache."""

    def __init__(
        self,
        location: str,
        session: requests.Session,
        access_token_getter: Callable[[], str],
        cache: OrderedDict = None,
    ):
        """Load the content for ``location`` and hand it to the base class.

        Args:
            location: Object location passed to the base class and used as
                the cache key.
            session: Session forwarded to ``_GcsInputStream``.
            access_token_getter: Token callable forwarded to
                ``_GcsInputStream``.
            cache: Optional ordered mapping of location to bytes, used as an
                LRU cache. ``None`` disables caching.

        If the stream raises ``FileNotFoundError``, the base class is
        initialised with ``None`` as the data instead of propagating it.
        """
        cache_hit = cache is not None and location in cache
        if cache_hit:
            # Refresh recency: the most recently used entry lives at the end.
            cache.move_to_end(location)
            super().__init__(location, cache[location])
            return

        # Cache miss (or no cache): read the entire object in one request.
        try:
            payload = _GcsInputStream(location, session, access_token_getter).read()

            if cache is not None:
                cache[location] = payload
                # Drop the least recently used entry once over the limit.
                if len(cache) > MAX_CACHE_SIZE:
                    cache.popitem(last=False)

            super().__init__(location, payload)
        except FileNotFoundError:
            super().__init__(location, None)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class _GcsOutputFile(OutputFile):
    """Output file handle that produces GCS upload streams."""

    def __init__(
        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
    ):
        """Remember the destination and the pieces needed to upload to it.

        Args:
            location: Destination location, also passed to the base class.
            session: Session handed to every stream this file creates.
            access_token_getter: Token callable handed to every stream this
                file creates.
        """
        super().__init__(location)
        self._location, self._session, self._access_token_getter = (
            location,
            session,
            access_token_getter,
        )

    def create(self):
        """Return a new ``_GcsOutputStream`` targeting this file's location."""
        stream_args = (self._location, self._session, self._access_token_getter)
        return _GcsOutputStream(*stream_args)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class GcsFileIO(FileIO):
    """Optimized HTTP-backed GCS FileIO.

    Talks to Google Cloud Storage through a pooled ``requests`` session and
    exposes `new_input`, `new_output`, `delete`, `exists`. Reads made through
    `new_input` share a per-instance LRU cache of file contents.

    The instance also carries ``manifest_paths`` and ``captured_manifests``
    lists for a blackhole/capture pattern for manifest files; nothing in this
    class fills them -- presumably callers do (TODO confirm).
    """

    def __init__(self):
        """Build credentials, an empty read cache and a pooled HTTPS session."""
        # Track manifest paths and captured manifests
        self.manifest_paths: list[str] = []
        self.captured_manifests: list[tuple[str, bytes]] = []

        # LRU cache for read operations (MAX_CACHE_SIZE files max)
        self._read_cache: OrderedDict = OrderedDict()

        # Credentials are refreshed lazily by get_access_token(), because the
        # token may expire during the lifetime of this object.
        self._credentials = _get_storage_credentials()
        self._access_token = None

        self._session = requests.session()
        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self._session.mount("https://", adapter)

    def _refresh_credentials(self):
        """Refresh the credentials if invalid and cache the current token.

        Best effort: any failure is logged as a warning and the cached token
        is reset to ``None`` rather than raising.
        """
        try:
            if not self._credentials.valid:
                req = Request()
                self._credentials.refresh(req)
            self._access_token = self._credentials.token
        except Exception as e:
            logger.warning("Failed to refresh GCS credentials: %s", e)
            self._access_token = None

    def get_access_token(self):
        """Return a current access token, or ``None`` if none is available."""
        # Refresh credentials on demand to avoid using expired tokens
        self._refresh_credentials()
        return self._access_token

    @staticmethod
    def _split_location(location: str) -> tuple:
        """Split a location into ``(bucket, percent-encoded object name)``.

        A leading ``gs://`` is dropped. The object name is encoded with
        ``safe=""``, so ``/`` is percent-encoded too.
        """
        path = location[5:] if location.startswith("gs://") else location
        bucket, _, object_name = path.partition("/")
        return bucket, urllib.parse.quote(object_name, safe="")

    def _auth_headers(self) -> dict:
        """Return request headers carrying the bearer token, if there is one."""
        token = self.get_access_token()
        return {"Authorization": f"Bearer {token}"} if token else {}

    def new_input(self, location: str) -> InputFile:
        """Return an input file for ``location`` backed by the shared read cache."""
        return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)

    def new_output(self, location: str) -> OutputFile:
        """Return an output file for ``location`` and drop any cached read of it."""
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("new_output -> %s", location)

        # Invalidate cache entry if present
        self._read_cache.pop(location, None)

        return _GcsOutputFile(location, self._session, self.get_access_token)

    def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
        """Delete the object at ``location``.

        Args:
            location: A location string, or an ``InputFile`` / ``OutputFile``
                whose ``location`` attribute is used.

        Raises:
            IOError: If the response status is neither 204 nor 404. A 404
                (object already absent) is treated as success.
        """
        if isinstance(location, (InputFile, OutputFile)):
            location = location.location

        # Invalidate cache entry if present
        self._read_cache.pop(location, None)

        bucket, object_name = self._split_location(location)
        url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o/{object_name}"

        response = self._session.delete(url, headers=self._auth_headers(), timeout=10)

        if response.status_code not in (204, 404):
            raise IOError(f"Failed to delete '{location}' - status {response.status_code}")

    def exists(self, location: str) -> bool:
        """Return ``True`` if a HEAD request for ``location`` answers 200.

        Any other status yields ``False``.
        """
        bucket, object_name = self._split_location(location)
        url = f"https://storage.googleapis.com/{bucket}/{object_name}"

        response = self._session.head(url, headers=self._auth_headers(), timeout=10)
        return response.status_code == 200
|