pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release.
- pixeltable/__init__.py +11 -1
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +96 -124
- pixeltable/catalog/table_metadata.py +96 -0
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +43 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +6 -7
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +2 -1
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/parquet.py +9 -89
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/arrow.py +97 -2
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
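The headline change in this release pair is a pluggable object-storage layer: `pixeltable/utils/media_store.py` and `pixeltable/utils/s3.py` are removed, replaced by `pixeltable/utils/object_stores.py` plus per-backend implementations (`s3_store.py`, `gcs_store.py`, `local_store.py`) and a new `exec/object_store_save_node.py`. As orientation for the `s3_store.py` diff below, here is a rough sketch of the backend interface; the method names and signatures are inferred from the `S3Store` code in this diff, not taken from the actual `ObjectStoreBase` definition in `object_stores.py`, which is not shown on this page and may differ:

```python
# Inferred sketch only: the real ObjectStoreBase lives in
# pixeltable/utils/object_stores.py, which this page does not show.
import uuid
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional


class ObjectStoreBase(ABC):
    """Interface that S3Store (below) implements; other backends presumably mirror it."""

    @abstractmethod
    def validate(self, error_col_name: str) -> Optional[str]:
        """Return the store's base URI if it is reachable; raise on failure."""

    @abstractmethod
    def copy_local_file(self, col: 'Column', src_path: Path) -> str:
        """Persist a local media file for a column and return its new URI."""

    @abstractmethod
    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Download a single stored object to a local file."""

    @abstractmethod
    def count(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
        """Count stored objects for a table, optionally restricted to one version."""

    @abstractmethod
    def delete(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
        """Delete a table's stored objects and return the number removed."""

    @abstractmethod
    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """List up to n_max object keys (or full URIs) under the store's prefix."""
```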
pixeltable/utils/s3_store.py (new file, +354 lines):

```diff
@@ -0,0 +1,354 @@
+import logging
+import re
+import threading
+import urllib.parse
+import uuid
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Iterator, NamedTuple, Optional
+
+import boto3
+import botocore
+from botocore.exceptions import ClientError
+
+from pixeltable import env, exceptions as excs
+from pixeltable.config import Config
+from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
+
+if TYPE_CHECKING:
+    from botocore.exceptions import ClientError
+
+    from pixeltable.catalog import Column
+
+_logger = logging.getLogger('pixeltable')
+
+client_lock = threading.Lock()
+
+
+class R2ClientDict(NamedTuple):
+    """Container for actual R2 access objects (clients, resources)
+    Thread-safe, protected by the module lock 'client_lock'"""
+
+    profile: Optional[str]  # profile used to find credentials
+    clients: dict[str, Any]  # Dictionary of URI to client object attached to the URI
+
+
+@env.register_client('r2')
+def _() -> Any:
+    profile_name = Config.get().get_string_value('r2_profile')
+    return R2ClientDict(profile=profile_name, clients={})
+
+
+@env.register_client('r2_resource')
+def _() -> Any:
+    profile_name = Config.get().get_string_value('r2_profile')
+    return R2ClientDict(profile=profile_name, clients={})
+
+
+@env.register_client('s3')
+def _() -> Any:
+    profile_name = Config.get().get_string_value('s3_profile')
+    return S3Store.create_boto_client(profile_name=profile_name)
+
+
+@env.register_client('s3_resource')
+def _() -> Any:
+    profile_name = Config.get().get_string_value('s3_profile')
+    return S3Store.create_boto_resource(profile_name=profile_name)
+
+
+class S3Store(ObjectStoreBase):
+    """Wrapper for an s3 storage target with all needed methods."""
+
+    # URI of the S3 bucket in the format s3://bucket_name/prefix/
+    # Always ends with a slash
+    __base_uri: str
+
+    # bucket name extracted from the URI
+    __bucket_name: str
+
+    # prefix path within the bucket, either empty or ending with a slash
+    __prefix_name: str
+
+    soa: StorageObjectAddress
+
+    def __init__(self, soa: StorageObjectAddress):
+        self.soa = soa
+        self.__bucket_name = self.soa.container
+        self.__prefix_name = self.soa.prefix
+        assert self.soa.storage_target in {StorageTarget.R2_STORE, StorageTarget.S3_STORE}, (
+            f'Expected storage_target "s3" or "r2", got {self.soa.storage_target}'
+        )
+        self.__base_uri = self.soa.prefix_free_uri + self.soa.prefix
+
+    def client(self) -> Any:
+        """Return a client to access the store."""
+        if self.soa.storage_target == StorageTarget.R2_STORE:
+            cd = env.Env.get().get_client('r2')
+            with client_lock:
+                if self.soa.container_free_uri not in cd.clients:
+                    cd.clients[self.soa.container_free_uri] = S3Store.create_boto_client(
+                        profile_name=cd.profile,
+                        extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
+                    )
+                return cd.clients[self.soa.container_free_uri]
+        if self.soa.storage_target == StorageTarget.S3_STORE:
+            return env.Env.get().get_client('s3')
+        raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')
+
+    def get_resource(self) -> Any:
+        if self.soa.storage_target == StorageTarget.R2_STORE:
+            cd = env.Env.get().get_client('r2_resource')
+            with client_lock:
+                if self.soa.container_free_uri not in cd.clients:
+                    cd.clients[self.soa.container_free_uri] = S3Store.create_boto_resource(
+                        profile_name=cd.profile,
+                        extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
+                    )
+                return cd.clients[self.soa.container_free_uri]
+        if self.soa.storage_target == StorageTarget.S3_STORE:
+            return env.Env.get().get_client('s3_resource')
+        raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')
+
+    @property
+    def bucket_name(self) -> str:
+        """Return the bucket name from the base URI."""
+        return self.__bucket_name
+
+    @property
+    def prefix(self) -> str:
+        """Return the prefix from the base URI."""
+        return self.__prefix_name
+
+    def validate(self, error_col_name: str) -> Optional[str]:
+        """
+        Checks if the URI exists.
+
+        Returns:
+            The store's base URI if the bucket exists and is accessible; raises excs.Error otherwise.
+        """
+        try:
+            self.client().head_bucket(Bucket=self.bucket_name)
+            return self.__base_uri
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'validate bucket {error_col_name}')
+            return None
+
+    def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
+        parent = f'{self.__base_uri}{prefix}'
+        return f'{parent}/{filename}'
+
+    def _prepare_uri(self, col: 'Column', ext: Optional[str] = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        assert col.tbl is not None, 'Column must be associated with a table'
+        return self._prepare_uri_raw(col.tbl.id, col.id, col.tbl.version, ext=ext)
+
+    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+        """Copies an object to a local file. Thread safe."""
+        try:
+            self.client().download_file(Bucket=self.bucket_name, Key=self.prefix + src_path, Filename=str(dest_path))
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'download file {src_path}')
+            raise
+
+    def copy_local_file(self, col: 'Column', src_path: Path) -> str:
+        """Copy a local file and return its new URL"""
+        new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
+        parsed = urllib.parse.urlparse(new_file_uri)
+        key = parsed.path.lstrip('/')
+        if self.soa.storage_target == StorageTarget.R2_STORE:
+            key = key.split('/', 1)[-1]  # Remove the bucket name from the key for R2
+        try:
+            _logger.debug(f'Media Storage: copying {src_path} to {new_file_uri} : Key: {key}')
+            self.client().upload_file(Filename=str(src_path), Bucket=self.bucket_name, Key=key)
+            _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+            return new_file_uri
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'upload file {src_path}')
+            raise
+
+    def _get_filtered_objects(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> tuple[Iterator, Any]:
+        """Private method to get filtered objects for a table, optionally filtered by version.
+
+        Args:
+            tbl_id: Table UUID to filter by
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Tuple of (iterator over S3 objects matching the criteria, bucket object)
+        """
+        # Use ObjectPath to construct the prefix for this table
+        table_prefix = ObjectPath.table_prefix(tbl_id)
+        prefix = f'{self.prefix}{table_prefix}/'
+
+        try:
+            # Use S3 resource interface for filtering
+            s3_resource = self.get_resource()
+            bucket = s3_resource.Bucket(self.bucket_name)
+
+            if tbl_version is None:
+                # Return all objects with the table prefix
+                object_iterator = bucket.objects.filter(Prefix=prefix)
+            else:
+                # Filter by both table_id and table_version using the ObjectPath pattern
+                # Pattern: tbl_id_col_id_version_uuid
+                version_pattern = re.compile(
+                    rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
+                )
+                # Return filtered collection - this still uses lazy loading
+                object_iterator = (
+                    obj for obj in bucket.objects.filter(Prefix=prefix) if version_pattern.match(obj.key.split('/')[-1])
+                )
+
+            return object_iterator, bucket
+
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'setup iterator {self.prefix}')
+            raise
+
+    def count(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
+        """Count the number of files belonging to tbl_id. If tbl_version is not None,
+        count only those files belonging to the specified tbl_version.
+
+        Args:
+            tbl_id: Table UUID to count objects for
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Number of objects matching the criteria
+        """
+        assert tbl_id is not None
+
+        object_iterator, _ = self._get_filtered_objects(tbl_id, tbl_version)
+
+        return sum(1 for _ in object_iterator)
+
+    def delete(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
+        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+        only those files belonging to the specified tbl_version.
+
+        Args:
+            tbl_id: Table UUID to delete objects for
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Number of objects deleted
+        """
+        assert tbl_id is not None
+
+        # Use shared method to get filtered objects and bucket
+        object_iterator, bucket = self._get_filtered_objects(tbl_id, tbl_version)
+
+        total_deleted = 0
+
+        try:
+            objects_to_delete = []
+
+            # Process objects in batches as we iterate (memory efficient)
+            for obj in object_iterator:
+                objects_to_delete.append({'Key': obj.key})
+
+                # Delete in batches of 1000 (S3 limit)
+                if len(objects_to_delete) >= 1000:
+                    bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
+                    total_deleted += len(objects_to_delete)
+                    objects_to_delete = []
+
+            # Delete any remaining objects in the final batch
+            if len(objects_to_delete) > 0:
+                bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
+                total_deleted += len(objects_to_delete)
+
+            return total_deleted
+
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'deleting with {self.prefix}')
+            raise
+
+    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+        """Return a list of objects found in the specified destination bucket.
+        Each returned object includes the full set of prefixes.
+        If return_uri is True, full URIs are returned; otherwise, just the object keys.
+        """
+        p = self.soa.prefix_free_uri if return_uri else ''
+
+        s3_client = self.client()
+        r: list[str] = []
+        try:
+            # Use paginator to handle more than 1000 objects
+            paginator = s3_client.get_paginator('list_objects_v2')
+            for page in paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix):
+                if 'Contents' not in page:
+                    continue
+                for obj in page['Contents']:
+                    if len(r) >= n_max:
+                        return r
+                    r.append(f'{p}{obj["Key"]}')
+        except ClientError as e:
+            self.handle_s3_error(e, self.bucket_name, f'list objects from {self.prefix}')
+        return r
+
+    @classmethod
+    def handle_s3_error(
+        cls, e: 'ClientError', bucket_name: str, operation: str = '', *, ignore_404: bool = False
+    ) -> None:
+        error_code = e.response.get('Error', {}).get('Code')
+        error_message = e.response.get('Error', {}).get('Message', str(e))
+        if ignore_404 and error_code == '404':
+            return
+        if error_code == '404':
+            raise excs.Error(f'Bucket {bucket_name} not found during {operation}: {error_message}')
+        elif error_code == '403':
+            raise excs.Error(f'Access denied to bucket {bucket_name} during {operation}: {error_message}')
+        elif error_code == 'PreconditionFailed' or 'PreconditionFailed' in error_message:
+            raise excs.Error(f'Precondition failed for bucket {bucket_name} during {operation}: {error_message}')
+        else:
+            raise excs.Error(f'Error during {operation} in bucket {bucket_name}: {error_code} - {error_message}')
+
+    @classmethod
+    def create_boto_session(cls, profile_name: Optional[str] = None) -> Any:
+        """Create a boto session using the defined profile"""
+        if profile_name:
+            try:
+                _logger.info(f'Creating boto session with profile {profile_name}')
+                session = boto3.Session(profile_name=profile_name)
+                return session
+            except Exception as e:
+                _logger.info(f'Error occurred while creating boto session with profile {profile_name}: {e}')
+        return boto3.Session()
+
+    @classmethod
+    def create_boto_client(cls, profile_name: Optional[str] = None, extra_args: Optional[dict[str, Any]] = None) -> Any:
+        config_args: dict[str, Any] = {
+            'max_pool_connections': 30,
+            'connect_timeout': 15,
+            'read_timeout': 30,
+            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
+        }
+
+        session = cls.create_boto_session(profile_name)
+
+        try:
+            # Check if credentials are available
+            session.get_credentials().get_frozen_credentials()
+            config = botocore.config.Config(**config_args)
+            return session.client('s3', config=config, **(extra_args or {}))  # credentials are available
+        except Exception as e:
+            _logger.info(f'Error occurred while creating S3 client: {e}, fallback to unsigned mode')
+            # No credentials available, use unsigned mode
+            config_args = config_args.copy()
+            config_args['signature_version'] = botocore.UNSIGNED
+            config = botocore.config.Config(**config_args)
+            return boto3.client('s3', config=config)
+
+    @classmethod
+    def create_boto_resource(
+        cls, profile_name: Optional[str] = None, extra_args: Optional[dict[str, Any]] = None
+    ) -> Any:
+        # Create a session using the defined profile
+        return cls.create_boto_session(profile_name).resource('s3', **(extra_args or {}))
```
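For readers evaluating the new module, a hypothetical usage sketch follows. `StorageObjectAddress` is defined in `object_stores.py`, which is not part of this excerpt, so its construction below (the `from_uri` helper, the bucket name, and the prefix) is assumed rather than taken from the diff:

```python
import uuid

from pixeltable.utils.object_stores import StorageObjectAddress
from pixeltable.utils.s3_store import S3Store

# Assumed constructor: StorageObjectAddress's real API is in object_stores.py,
# which this page does not show.
soa = StorageObjectAddress.from_uri('s3://my-bucket/pxt-media/')  # hypothetical helper
store = S3Store(soa)

store.validate('media_col')          # raises pixeltable.exceptions.Error if the bucket is unreachable
store.list_objects(return_uri=True)  # up to 10 full URIs under the configured prefix

tbl_id = uuid.uuid4()                # in real use, a catalog table's UUID
n = store.count(tbl_id, tbl_version=5)  # objects written for table version 5
store.delete(tbl_id)                 # batched bucket.delete_objects, 1,000 keys per call
```

Two details worth noting from the diff itself: the new `s3_profile` and `r2_profile` config keys (read via `Config.get().get_string_value(...)`) select which credentials profile each backend uses, and `create_boto_client` falls back to an unsigned client (`signature_version = botocore.UNSIGNED`) when no credentials resolve, so public buckets remain readable without an AWS profile.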