opteryx-catalog 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opteryx-catalog might be problematic. Click here for more details.

@@ -0,0 +1,255 @@
1
+ """
2
+ Optimized GCS FileIO for opteryx_catalog.iops
3
+ """
4
+
5
+ import io
6
+ import logging
7
+ import os
8
+ import urllib.parse
9
+ from collections import OrderedDict
10
+ from typing import Callable
11
+ from typing import Union
12
+
13
+ import requests
14
+ from google.auth.transport.requests import Request
15
+ from requests.adapters import HTTPAdapter
16
+
17
+ from .base import FileIO
18
+ from .base import InputFile
19
+ from .base import OutputFile
20
+
21
+ # Maximum number of recently read files kept in the local read cache (oldest entry is evicted first)
22
+ MAX_CACHE_SIZE: int = 32
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def _get_storage_credentials():
    """Return the credentials object held by a ``google.cloud.storage`` client.

    When the ``STORAGE_EMULATOR_HOST`` environment variable is set to a
    non-empty value, the client is built with anonymous credentials;
    otherwise the client's default credential discovery is used.

    Returns:
        The client's private ``_credentials`` attribute.
    """
    # Imported lazily so the dependency is only loaded when credentials are needed
    from google.cloud import storage

    if not os.environ.get("STORAGE_EMULATOR_HOST"):
        return storage.Client()._credentials

    from google.auth.credentials import AnonymousCredentials

    emulator_client = storage.Client(credentials=AnonymousCredentials())
    return emulator_client._credentials
37
+
38
+
39
class _GcsInputStream(io.BytesIO):
    """In-memory stream over the full contents of a single GCS object.

    The object is downloaded with one HTTP GET while the stream is being
    constructed; afterwards the instance behaves like an ``io.BytesIO``
    wrapping the downloaded bytes.

    Args:
        path: Object location as ``bucket/object``, optionally prefixed
            with ``gs://``.
        session: Session used to issue the GET request.
        access_token_getter: Zero-argument callable returning a bearer
            token. A falsy result sends the request without an
            ``Authorization`` header.

    Raises:
        FileNotFoundError: If the response status is anything other than 200.
    """

    def __init__(
        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
    ):
        # Drop the optional scheme so only "bucket/object" remains
        if path.startswith("gs://"):
            path = path[len("gs://") :]

        bucket, _, object_name = path.partition("/")
        # safe="" percent-encodes every reserved character, "/" included
        encoded_name = urllib.parse.quote(object_name, safe="")

        request_headers = {"Accept-Encoding": "identity"}
        bearer = access_token_getter()
        if bearer:
            request_headers["Authorization"] = f"Bearer {bearer}"

        reply = session.get(
            f"https://storage.googleapis.com/{bucket}/{encoded_name}",
            headers=request_headers,
            timeout=30,
        )

        # Any status other than 200 is reported as a missing file
        if reply.status_code != 200:
            raise FileNotFoundError(
                f"Unable to read '{path}' - status {reply.status_code}: {reply.text}"
            )

        super().__init__(reply.content)
67
+
68
+
69
class _GcsOutputStream(io.BytesIO):
    """In-memory write buffer that uploads its contents to GCS on ``close()``.

    Bytes written to the stream are held in memory. The first call to
    ``close()`` sends the whole buffer in a single POST (``uploadType=media``)
    to the bucket's upload endpoint and then releases the buffer.

    Args:
        path: Destination as ``bucket/object``, optionally prefixed with
            ``gs://``.
        session: Session used to issue the upload request.
        access_token_getter: Zero-argument callable returning a bearer
            token. A falsy result sends the request without an
            ``Authorization`` header.
    """

    def __init__(
        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
    ):
        super().__init__()
        self._path = path
        self._session = session
        self._access_token_getter = access_token_getter
        # Set once close() has started, so the upload is attempted at most once
        self._closed = False

    def close(self):
        """Upload the buffered bytes, then release the buffer.

        Calls after the first are no-ops, including calls that follow a
        failed upload: the stream is always left closed.

        Raises:
            IOError: If the upload response status is neither 200 nor 201.
        """
        if self._closed:
            return

        # Mark the stream closed before doing any I/O. If the upload raised
        # with the stream still open, the buffer would never be released and
        # IOBase finalization would call close() again, silently re-attempting
        # the upload during garbage collection.
        self._closed = True

        try:
            path = self._path
            if path.startswith("gs://"):
                path = path[5:]

            bucket, _, object_name = path.partition("/")
            url = f"https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o"

            # Must be read before the underlying buffer is closed
            data = self.getvalue()

            token = self._access_token_getter()
            headers = {
                "Content-Type": "application/octet-stream",
                "Content-Length": str(len(data)),
            }
            if token:
                headers["Authorization"] = f"Bearer {token}"

            response = self._session.post(
                url,
                params={"uploadType": "media", "name": object_name},
                headers=headers,
                data=data,
                timeout=60,
            )

            if response.status_code not in (200, 201):
                raise IOError(
                    f"Failed to write '{self._path}' - status {response.status_code}: {response.text}"
                )
        finally:
            # Release the in-memory buffer whether or not the upload succeeded
            super().close()
116
+
117
+
118
class _GcsInputFile(InputFile):
    """InputFile whose bytes come from GCS, optionally via a shared read cache.

    Args:
        location: Object location handed to the base class and used as the
            cache key.
        session: Session used for the download on a cache miss.
        access_token_getter: Zero-argument callable returning a bearer token.
        cache: Optional ``OrderedDict`` mapping locations to previously read
            bytes. It is updated in place; ``None`` disables caching.
    """

    def __init__(
        self,
        location: str,
        session: requests.Session,
        access_token_getter: Callable[[], str],
        cache: OrderedDict = None,
    ):
        use_cache = cache is not None

        if use_cache and location in cache:
            # Cache hit: mark the entry as most recently used and reuse its bytes
            cache.move_to_end(location)
            super().__init__(location, cache[location])
            return

        try:
            # Cache miss: download the whole object in one request
            payload = _GcsInputStream(location, session, access_token_getter).read()

            if use_cache:
                cache[location] = payload
                # Keep at most MAX_CACHE_SIZE entries by dropping the oldest one
                if len(cache) > MAX_CACHE_SIZE:
                    cache.popitem(last=False)

            super().__init__(location, payload)
        except FileNotFoundError:
            # An unreadable object is represented by a file with no data
            super().__init__(location, None)
149
+
150
+
151
class _GcsOutputFile(OutputFile):
    """OutputFile that hands out upload-on-close streams for one GCS location.

    Args:
        location: Destination passed to the base class and to every stream
            created by ``create``.
        session: Session the created streams use for the upload.
        access_token_getter: Zero-argument callable returning a bearer token.
    """

    def __init__(
        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
    ):
        super().__init__(location)
        self._access_token_getter = access_token_getter
        self._session = session
        self._location = location

    def create(self):
        """Return a new ``_GcsOutputStream`` targeting this file's location."""
        stream = _GcsOutputStream(
            self._location,
            self._session,
            self._access_token_getter,
        )
        return stream
162
+
163
+
164
class GcsFileIO(FileIO):
    """Optimized HTTP-backed GCS FileIO.

    Exposes `new_input`, `new_output`, `delete` and `exists`, all issued
    through one pooled ``requests`` session. Reads go through a small LRU
    cache keyed by location; `new_output` and `delete` invalidate the
    matching cache entry.

    `manifest_paths` and `captured_manifests` are initialised empty and are
    not populated by this class itself.
    NOTE(review): presumably they back a blackhole/capture pattern for
    manifest files driven by callers or subclasses - confirm.
    """

    def __init__(self):
        # Track manifest paths and captured manifests
        self.manifest_paths: list[str] = []
        self.captured_manifests: list[tuple[str, bytes]] = []

        # LRU cache for read operations (MAX_CACHE_SIZE files max)
        self._read_cache: OrderedDict = OrderedDict()

        # Credentials are refreshed lazily by get_access_token (token may expire)
        self._credentials = _get_storage_credentials()
        self._access_token = None

        # One shared session with a large connection pool for https:// requests
        self._session = requests.session()
        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self._session.mount("https://", adapter)

    def _refresh_credentials(self) -> None:
        """Refresh the credentials if they are not valid and store the token.

        Best effort: any failure is logged as a warning and the stored token
        is reset to ``None`` instead of raising.
        """
        try:
            if not self._credentials.valid:
                req = Request()
                self._credentials.refresh(req)
            self._access_token = self._credentials.token
        except Exception as e:
            logger.warning("Failed to refresh GCS credentials: %s", e)
            self._access_token = None

    def get_access_token(self):
        """Return the current access token, or ``None`` if none is available.

        Credentials are refreshed on demand to avoid using expired tokens.
        """
        self._refresh_credentials()
        return self._access_token

    @staticmethod
    def _split_location(location: str) -> tuple:
        """Split a location into ``(bucket, percent-encoded object name)``.

        An optional ``gs://`` prefix is removed first. The object name is
        encoded with ``safe=""``, so ``/`` is percent-encoded as well.
        """
        path = location
        if path.startswith("gs://"):
            path = path[5:]
        bucket, _, object_name = path.partition("/")
        return bucket, urllib.parse.quote(object_name, safe="")

    def _auth_headers(self) -> dict:
        """Return request headers carrying the bearer token, if there is one."""
        token = self.get_access_token()
        headers = {}
        if token:
            headers["Authorization"] = f"Bearer {token}"
        return headers

    def new_input(self, location: str) -> InputFile:
        """Return an input file for `location`, served from the read cache when possible."""
        return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)

    def new_output(self, location: str) -> OutputFile:
        """Return an output file for `location` and drop any cached copy of it."""
        # Lazy %-style argument: the message is only formatted if INFO is enabled
        logger.info("new_output -> %s", location)

        # Invalidate cache entry if present
        self._read_cache.pop(location, None)

        return _GcsOutputFile(location, self._session, self.get_access_token)

    def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
        """Delete the object at `location`.

        Args:
            location: A location string, or an input/output file whose
                ``location`` attribute is used.

        Raises:
            IOError: If the response status is neither 204 nor 404; a
                missing object is not treated as an error.
        """
        if isinstance(location, (InputFile, OutputFile)):
            location = location.location

        # Invalidate cache entry if present
        self._read_cache.pop(location, None)

        bucket, object_full_path = self._split_location(location)
        url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o/{object_full_path}"

        response = self._session.delete(url, headers=self._auth_headers(), timeout=10)

        if response.status_code not in (204, 404):
            raise IOError(f"Failed to delete '{location}' - status {response.status_code}")

    def exists(self, location: str) -> bool:
        """Return True if a HEAD request for `location` answers with status 200.

        Every other status yields False.
        """
        bucket, object_full_path = self._split_location(location)
        url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"

        response = self._session.head(url, headers=self._auth_headers(), timeout=10)
        return response.status_code == 200