pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (55) hide show
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,354 @@
1
+ import logging
2
+ import re
3
+ import threading
4
+ import urllib.parse
5
+ import uuid
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Iterator, NamedTuple, Optional
8
+
9
+ import boto3
10
+ import botocore
11
+ from botocore.exceptions import ClientError
12
+
13
+ from pixeltable import env, exceptions as excs
14
+ from pixeltable.config import Config
15
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
16
+
17
+ if TYPE_CHECKING:
18
+ from botocore.exceptions import ClientError
19
+
20
+ from pixeltable.catalog import Column
21
+
22
+ _logger = logging.getLogger('pixeltable')
23
+
24
+ client_lock = threading.Lock()
25
+
26
+
27
class R2ClientDict(NamedTuple):
    """Container for actual R2 access objects (clients, resources).

    Thread-safe: all reads/writes of `clients` are protected by the
    module-level lock `client_lock` (see S3Store.client / get_resource).
    """

    profile: Optional[str]  # profile used to find credentials
    clients: dict[str, Any]  # Dictionary of URI to client object attached to the URI
33
+
34
+
35
@env.register_client('r2')
def _() -> Any:
    """Register an (initially empty) per-endpoint cache of R2 clients."""
    # Clients are created lazily, one per endpoint URI, by S3Store.client().
    return R2ClientDict(profile=Config.get().get_string_value('r2_profile'), clients={})
39
+
40
+
41
@env.register_client('r2_resource')
def _() -> Any:
    """Register an (initially empty) per-endpoint cache of R2 resources."""
    # Resources are created lazily, one per endpoint URI, by S3Store.get_resource().
    return R2ClientDict(profile=Config.get().get_string_value('r2_profile'), clients={})
45
+
46
+
47
@env.register_client('s3')
def _() -> Any:
    """Register the shared boto3 S3 client, built with the configured profile."""
    return S3Store.create_boto_client(profile_name=Config.get().get_string_value('s3_profile'))
51
+
52
+
53
@env.register_client('s3_resource')
def _() -> Any:
    """Register the shared boto3 S3 resource, built with the configured profile."""
    return S3Store.create_boto_resource(profile_name=Config.get().get_string_value('s3_profile'))
57
+
58
+
59
class S3Store(ObjectStoreBase):
    """Wrapper for an s3 storage target with all needed methods.

    Handles both AWS S3 and Cloudflare R2 (which speaks the S3 API against a
    custom endpoint). The target is described by a StorageObjectAddress; all
    object keys are placed under that address's prefix.
    """

    # URI of the S3 bucket in the format s3://bucket_name/prefix/
    # Always ends with a slash
    __base_uri: str

    # bucket name extracted from the URI
    __bucket_name: str

    # prefix path within the bucket, either empty or ending with a slash
    __prefix_name: str

    soa: StorageObjectAddress

    def __init__(self, soa: StorageObjectAddress):
        self.soa = soa
        self.__bucket_name = self.soa.container
        self.__prefix_name = self.soa.prefix
        assert self.soa.storage_target in {StorageTarget.R2_STORE, StorageTarget.S3_STORE}, (
            f'Expected storage_target "s3" or "r2", got {self.soa.storage_target}'
        )
        self.__base_uri = self.soa.prefix_free_uri + self.soa.prefix

    def client(self) -> Any:
        """Return a client to access the store."""
        if self.soa.storage_target == StorageTarget.R2_STORE:
            # R2 clients are endpoint-specific: cache one per endpoint URI,
            # guarded by the module-level lock.
            cd = env.Env.get().get_client('r2')
            with client_lock:
                if self.soa.container_free_uri not in cd.clients:
                    cd.clients[self.soa.container_free_uri] = S3Store.create_boto_client(
                        profile_name=cd.profile,
                        extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
                    )
                return cd.clients[self.soa.container_free_uri]
        if self.soa.storage_target == StorageTarget.S3_STORE:
            return env.Env.get().get_client('s3')
        raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')

    def get_resource(self) -> Any:
        """Return a boto3 resource to access the store (for bucket-level iteration)."""
        if self.soa.storage_target == StorageTarget.R2_STORE:
            # Same per-endpoint caching scheme as client(), but for resources.
            cd = env.Env.get().get_client('r2_resource')
            with client_lock:
                if self.soa.container_free_uri not in cd.clients:
                    cd.clients[self.soa.container_free_uri] = S3Store.create_boto_resource(
                        profile_name=cd.profile,
                        extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
                    )
                return cd.clients[self.soa.container_free_uri]
        if self.soa.storage_target == StorageTarget.S3_STORE:
            return env.Env.get().get_client('s3_resource')
        raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')

    @property
    def bucket_name(self) -> str:
        """Return the bucket name from the base URI."""
        return self.__bucket_name

    @property
    def prefix(self) -> str:
        """Return the prefix from the base URI."""
        return self.__prefix_name

    def validate(self, error_col_name: str) -> Optional[str]:
        """
        Checks if the URI exists and is accessible.

        Returns:
            The base URI if the bucket exists and is accessible.
            On failure, handle_s3_error raises an excs.Error; the trailing
            `return None` is retained for interface compatibility.
        """
        try:
            self.client().head_bucket(Bucket=self.bucket_name)
            return self.__base_uri
        except ClientError as e:
            self.handle_s3_error(e, self.bucket_name, f'validate bucket {error_col_name}')
            return None

    def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> str:
        """
        Construct a new, unique URI for a persisted media file.
        """
        prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
        parent = f'{self.__base_uri}{prefix}'
        # Fix: append the generated filename (it was previously dropped, which
        # produced an invalid URI and left `filename` unused).
        return f'{parent}/{filename}'

    def _prepare_uri(self, col: 'Column', ext: Optional[str] = None) -> str:
        """
        Construct a new, unique URI for a persisted media file.
        """
        assert col.tbl is not None, 'Column must be associated with a table'
        return self._prepare_uri_raw(col.tbl.id, col.id, col.tbl.version, ext=ext)

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Copies an object to a local file. Thread safe."""
        try:
            self.client().download_file(Bucket=self.bucket_name, Key=self.prefix + src_path, Filename=str(dest_path))
        except ClientError as e:
            self.handle_s3_error(e, self.bucket_name, f'download file {src_path}')
            raise

    def copy_local_file(self, col: 'Column', src_path: Path) -> str:
        """Copy a local file, and return its new URL"""
        new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
        parsed = urllib.parse.urlparse(new_file_uri)
        key = parsed.path.lstrip('/')
        if self.soa.storage_target == StorageTarget.R2_STORE:
            key = key.split('/', 1)[-1]  # Remove the bucket name from the key for R2
        try:
            _logger.debug(f'Media Storage: copying {src_path} to {new_file_uri} : Key: {key}')
            self.client().upload_file(Filename=str(src_path), Bucket=self.bucket_name, Key=key)
            _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
            return new_file_uri
        except ClientError as e:
            # Fix: describe the actual operation (was a copy-pasted 'setup iterator' message)
            self.handle_s3_error(e, self.bucket_name, f'upload file {src_path}')
            raise

    def _get_filtered_objects(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> tuple[Iterator, Any]:
        """Private method to get filtered objects for a table, optionally filtered by version.

        Args:
            tbl_id: Table UUID to filter by
            tbl_version: Optional table version to filter by

        Returns:
            Tuple of (iterator over S3 objects matching the criteria, bucket object)
        """
        # Use ObjectPath to construct the prefix for this table
        table_prefix = ObjectPath.table_prefix(tbl_id)
        prefix = f'{self.prefix}{table_prefix}/'

        try:
            # Use S3 resource interface for filtering
            s3_resource = self.get_resource()
            bucket = s3_resource.Bucket(self.bucket_name)

            if tbl_version is None:
                # Return all objects with the table prefix
                object_iterator = bucket.objects.filter(Prefix=prefix)
            else:
                # Filter by both table_id and table_version using the ObjectPath pattern
                # Pattern: tbl_id_col_id_version_uuid
                version_pattern = re.compile(
                    rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
                )
                # Return filtered collection - this still uses lazy loading
                object_iterator = (
                    obj for obj in bucket.objects.filter(Prefix=prefix) if version_pattern.match(obj.key.split('/')[-1])
                )

            return object_iterator, bucket

        except ClientError as e:
            self.handle_s3_error(e, self.bucket_name, f'setup iterator {self.prefix}')
            raise

    def count(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
        """Count the number of files belonging to tbl_id. If tbl_version is not None,
        count only those files belonging to the specified tbl_version.

        Args:
            tbl_id: Table UUID to count objects for
            tbl_version: Optional table version to filter by

        Returns:
            Number of objects matching the criteria
        """
        assert tbl_id is not None

        object_iterator, _ = self._get_filtered_objects(tbl_id, tbl_version)

        return sum(1 for _ in object_iterator)

    def delete(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
        only those files belonging to the specified tbl_version.

        Args:
            tbl_id: Table UUID to delete objects for
            tbl_version: Optional table version to filter by

        Returns:
            Number of objects deleted
        """
        assert tbl_id is not None

        # Use shared method to get filtered objects and bucket
        object_iterator, bucket = self._get_filtered_objects(tbl_id, tbl_version)

        total_deleted = 0

        try:
            objects_to_delete = []

            # Process objects in batches as we iterate (memory efficient)
            for obj in object_iterator:
                objects_to_delete.append({'Key': obj.key})

                # Delete in batches of 1000 (S3 limit)
                if len(objects_to_delete) >= 1000:
                    bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
                    total_deleted += len(objects_to_delete)
                    objects_to_delete = []

            # Delete any remaining objects in the final batch
            if len(objects_to_delete) > 0:
                bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
                total_deleted += len(objects_to_delete)

            return total_deleted

        except ClientError as e:
            self.handle_s3_error(e, self.bucket_name, f'deleting with {self.prefix}')
            raise

    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects found in the specified destination bucket.
        Each returned object includes the full set of prefixes.
        If return_uri is True, full URIs are returned; otherwise, just the object keys.
        """
        p = self.soa.prefix_free_uri if return_uri else ''

        s3_client = self.client()
        r: list[str] = []
        try:
            # Use paginator to handle more than 1000 objects
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix):
                if 'Contents' not in page:
                    continue
                for obj in page['Contents']:
                    if len(r) >= n_max:
                        return r
                    r.append(f'{p}{obj["Key"]}')
        except ClientError as e:
            self.handle_s3_error(e, self.bucket_name, f'list objects from {self.prefix}')
        return r

    @classmethod
    def handle_s3_error(
        cls, e: 'ClientError', bucket_name: str, operation: str = '', *, ignore_404: bool = False
    ) -> None:
        """Translate a botocore ClientError into an excs.Error (or swallow a 404 if requested)."""
        error_code = e.response.get('Error', {}).get('Code')
        error_message = e.response.get('Error', {}).get('Message', str(e))
        if ignore_404 and error_code == '404':
            return
        if error_code == '404':
            raise excs.Error(f'Bucket {bucket_name} not found during {operation}: {error_message}')
        elif error_code == '403':
            raise excs.Error(f'Access denied to bucket {bucket_name} during {operation}: {error_message}')
        elif error_code == 'PreconditionFailed' or 'PreconditionFailed' in error_message:
            raise excs.Error(f'Precondition failed for bucket {bucket_name} during {operation}: {error_message}')
        else:
            raise excs.Error(f'Error during {operation} in bucket {bucket_name}: {error_code} - {error_message}')

    @classmethod
    def create_boto_session(cls, profile_name: Optional[str] = None) -> Any:
        """Create a boto session using the defined profile, falling back to the default session."""
        if profile_name:
            try:
                _logger.info(f'Creating boto session with profile {profile_name}')
                session = boto3.Session(profile_name=profile_name)
                return session
            except Exception as e:
                _logger.info(f'Error occurred while creating boto session with profile {profile_name}: {e}')
        return boto3.Session()

    @classmethod
    def create_boto_client(cls, profile_name: Optional[str] = None, extra_args: Optional[dict[str, Any]] = None) -> Any:
        """Create an S3 client; falls back to unsigned (anonymous) mode without credentials.

        Args:
            profile_name: Optional AWS profile to source credentials from
            extra_args: Extra client kwargs (e.g. endpoint_url/region_name for R2)
        """
        config_args: dict[str, Any] = {
            'max_pool_connections': 30,
            'connect_timeout': 15,
            'read_timeout': 30,
            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
        }

        session = cls.create_boto_session(profile_name)

        try:
            # Check if credentials are available
            session.get_credentials().get_frozen_credentials()
            config = botocore.config.Config(**config_args)
            return session.client('s3', config=config, **(extra_args or {}))  # credentials are available
        except Exception as e:
            _logger.info(f'Error occurred while creating S3 client: {e}, fallback to unsigned mode')
            # No credentials available, use unsigned mode
            config_args = config_args.copy()
            config_args['signature_version'] = botocore.UNSIGNED
            config = botocore.config.Config(**config_args)
            # Fix: keep endpoint/region overrides (e.g. an R2 endpoint_url) in the
            # unsigned fallback; dropping them silently redirected requests to AWS S3.
            return boto3.client('s3', config=config, **(extra_args or {}))

    @classmethod
    def create_boto_resource(
        cls, profile_name: Optional[str] = None, extra_args: Optional[dict[str, Any]] = None
    ) -> Any:
        """Create an S3 resource using the defined profile (no unsigned fallback)."""
        # Create a session using the defined profile
        return cls.create_boto_session(profile_name).resource('s3', **(extra_args or {}))