pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (55)
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/utils/dbms.py CHANGED
@@ -1,6 +1,6 @@
 import abc
 
-from sqlalchemy import URL
+import sqlalchemy as sql
 
 
 class Dbms(abc.ABC):
@@ -11,9 +11,9 @@ class Dbms(abc.ABC):
     name: str
     transaction_isolation_level: str
     version_index_type: str
-    db_url: URL
+    db_url: sql.URL
 
-    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
+    def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: sql.URL) -> None:
         self.name = name
         self.transaction_isolation_level = transaction_isolation_level
         self.version_index_type = version_index_type
@@ -28,13 +28,18 @@ class Dbms(abc.ABC):
     @abc.abstractmethod
     def default_system_db_url(self) -> str: ...
 
+    @abc.abstractmethod
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None: ...
+
 
 class PostgresqlDbms(Dbms):
     """
     Implements utilities to interact with Postgres database.
     """
 
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
@@ -47,13 +52,25 @@ class PostgresqlDbms(Dbms):
         a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
         return a
 
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None:
+        idx = sql.Index(
+            index_name,
+            index_value_sa_col,
+            postgresql_using='hnsw',
+            postgresql_with={'m': 16, 'ef_construction': 64},
+            postgresql_ops={index_value_sa_col.name: metric},
+        )
+        idx.create(bind=conn)
+
 
 class CockroachDbms(Dbms):
     """
     Implements utilities to interact with CockroachDb database.
     """
 
-    def __init__(self, db_url: URL):
+    def __init__(self, db_url: sql.URL):
         super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
@@ -64,3 +81,12 @@ class CockroachDbms(Dbms):
 
     def default_system_db_url(self) -> str:
         return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
+
+    def create_vector_index(
+        self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
+    ) -> None:
+        create_index_sql = sql.text(
+            f"""CREATE VECTOR INDEX {index_name} ON {index_value_sa_col.table.name}
+            ({index_value_sa_col.name} {metric})"""
+        )
+        conn.execute(create_index_sql)
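
The new create_vector_index hook lets each Dbms subclass build a vector index natively: Postgres gets a pgvector HNSW index via SQLAlchemy, CockroachDB a CREATE VECTOR INDEX statement. A minimal usage sketch, not part of the diff; the connection details, table, and column names are hypothetical, and 'vector_cosine_ops' is pgvector's operator class for cosine distance:

import sqlalchemy as sql
from pixeltable.utils.dbms import PostgresqlDbms

# Hypothetical Postgres database with the pgvector extension installed.
db_url = sql.URL.create('postgresql+psycopg2', username='user', password='pw', host='localhost', database='pxt')
dbms = PostgresqlDbms(db_url)
engine = sql.create_engine(db_url)
with engine.begin() as conn:
    # Reflect an existing table that has a vector-typed 'embedding' column.
    tbl = sql.Table('media_tbl', sql.MetaData(), autoload_with=conn)
    dbms.create_vector_index('idx_embedding_hnsw', tbl.c.embedding, conn, 'vector_cosine_ops')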
pixeltable/utils/gcs_store.py ADDED
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+import logging
+import re
+import urllib.parse
+import uuid
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Iterator, Optional
+
+from google.api_core.exceptions import GoogleAPIError
+from google.cloud import storage  # type: ignore[attr-defined]
+from google.cloud.exceptions import Forbidden, NotFound
+from google.cloud.storage.client import Client  # type: ignore[import-untyped]
+
+from pixeltable import env, exceptions as excs
+from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
+
+if TYPE_CHECKING:
+    from pixeltable.catalog import Column
+
+_logger = logging.getLogger('pixeltable')
+
+
+@env.register_client('gcs_store')
+def _() -> 'Client':
+    """Create and return a GCS client, using default credentials if available,
+    otherwise creating an anonymous client for public buckets.
+    """
+    try:
+        # Create a client with default credentials
+        # Note that if the default credentials have expired, gcloud will still create a client,
+        # which will report the expiry error when it is used.
+        # To create and use an anonymous client, expired credentials must be removed.
+        # For application default credentials, delete the file in ~/.config/gcloud/, or
+        #     gcloud auth application-default revoke
+        # OR
+        # For service account keys, you must delete the downloaded key file.
+        client = storage.Client()
+        return client
+    except Exception:
+        # If no credentials are found, create an anonymous client which can be used for public buckets.
+        client = storage.Client.create_anonymous_client()
+        return client
+
+
+class GCSStore(ObjectStoreBase):
+    """Class to handle Google Cloud Storage operations."""
+
+    # URI of the GCS bucket in the format gs://bucket_name/prefix/
+    # Always ends with a slash
+    __base_uri: str
+
+    # bucket name extracted from the URI
+    __bucket_name: str
+
+    # prefix path within the bucket, either empty or ending with a slash
+    __prefix_name: str
+
+    # The parsed form of the given destination address
+    soa: StorageObjectAddress
+
+    def __init__(self, soa: StorageObjectAddress):
+        assert soa.storage_target == StorageTarget.GCS_STORE, f'Expected storage_target "gs", got {soa.storage_target}'
+        self.soa = soa
+        self.__base_uri = soa.prefix_free_uri + soa.prefix
+        self.__bucket_name = soa.container
+        self.__prefix_name = soa.prefix
+
+    @classmethod
+    def client(cls) -> 'Client':
+        """Return the GCS client."""
+        return env.Env.get().get_client('gcs_store')
+
+    @property
+    def bucket_name(self) -> str:
+        """Return the bucket name from the base URI."""
+        return self.__bucket_name
+
+    @property
+    def prefix(self) -> str:
+        """Return the prefix from the base URI."""
+        return self.__prefix_name
+
+    def validate(self, error_col_name: str) -> Optional[str]:
+        """
+        Checks if the URI exists.
+
+        Returns:
+            str: The base URI if the GCS bucket exists and is accessible, None otherwise.
+        """
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blobs = bucket.list_blobs(max_results=1)
+            # This will raise an exception if the destination doesn't exist or cannot be listed
+            _ = list(blobs)  # Force evaluation to check access
+            return self.__base_uri
+        except (NotFound, Forbidden, GoogleAPIError) as e:
+            self.handle_gcs_error(e, self.bucket_name, f'validate bucket {error_col_name}')
+            return None
+
+    def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
+        parent = f'{self.__base_uri}{prefix}'
+        return f'{parent}/{filename}'
+
+    def _prepare_uri(self, col: Column, ext: Optional[str] = None) -> str:
+        """
+        Construct a new, unique URI for a persisted media file.
+        """
+        assert col.tbl is not None, 'Column must be associated with a table'
+        return self._prepare_uri_raw(col.tbl.id, col.id, col.tbl.version, ext=ext)
+
+    def copy_local_file(self, col: Column, src_path: Path) -> str:
+        """Copy a local file, and return its new URL"""
+        new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
+        parsed = urllib.parse.urlparse(new_file_uri)
+        blob_name = parsed.path.lstrip('/')
+
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob = bucket.blob(blob_name)
+            blob.upload_from_filename(str(src_path))
+            _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+            return new_file_uri
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'upload file {src_path}')
+            raise
+
+    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+        """Copies an object to a local file. Thread safe"""
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob = bucket.blob(self.prefix + src_path)
+            blob.download_to_filename(str(dest_path))
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'download file {src_path}')
+            raise
+
+    def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> Iterator:
+        """Private method to get filtered objects for a table, optionally filtered by version.
+
+        Args:
+            tbl_id: Table UUID to filter by
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Tuple of (iterator over GCS objects matching the criteria, bucket object)
+        """
+        table_prefix = ObjectPath.table_prefix(tbl_id)
+        prefix = f'{self.prefix}{table_prefix}/'
+
+        if tbl_version is None:
+            # Return all blobs with the table prefix
+            blob_iterator = bucket.list_blobs(prefix=prefix)
+        else:
+            # Filter by both table_id and table_version using the ObjectPath pattern
+            # Pattern: tbl_id_col_id_version_uuid
+            version_pattern = re.compile(rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*')
+            # Return filtered collection - this still uses lazy loading
+            all_blobs = bucket.list_blobs(prefix=prefix)
+            blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
+
+        return blob_iterator
+
+    def count(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
+        """Count the number of files belonging to tbl_id. If tbl_version is not None,
+        count only those files belonging to the specified tbl_version.
+
+        Args:
+            tbl_id: Table UUID to count objects for
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Number of objects matching the criteria
+        """
+        assert tbl_id is not None
+
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+
+            blob_iterator = self._get_filtered_objects(bucket, tbl_id, tbl_version)
+
+            return sum(1 for _ in blob_iterator)
+
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'setup iterator {self.prefix}')
+            raise
+
+    def delete(self, tbl_id: uuid.UUID, tbl_version: Optional[int] = None) -> int:
+        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+        only those files belonging to the specified tbl_version.
+
+        Args:
+            tbl_id: Table UUID to delete objects for
+            tbl_version: Optional table version to filter by
+
+        Returns:
+            Number of objects deleted
+        """
+        assert tbl_id is not None
+        total_deleted = 0
+
+        try:
+            client = self.client()
+            bucket = client.bucket(self.bucket_name)
+            blob_iterator = self._get_filtered_objects(bucket, tbl_id, tbl_version)
+
+            # Collect blob names for batch deletion
+            blobs_to_delete = []
+
+            for blob in blob_iterator:
+                blobs_to_delete.append(blob)
+
+                # Process in batches for efficiency
+                if len(blobs_to_delete) >= 100:
+                    with client.batch():
+                        for b in blobs_to_delete:
+                            b.delete()
+                    total_deleted += len(blobs_to_delete)
+                    blobs_to_delete = []
+
+            # Delete any remaining blobs in the final batch
+            if len(blobs_to_delete) > 0:
+                with client.batch():
+                    for b in blobs_to_delete:
+                        b.delete()
+                total_deleted += len(blobs_to_delete)
+
+            return total_deleted
+
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'deleting with {self.prefix}')
+            raise
+
+    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+        """Return a list of objects found in the specified destination bucket.
+        Each returned object includes the full set of prefixes.
+        if return_uri is True, full URI's are returned; otherwise, just the object keys.
+        """
+        p = self.soa.prefix_free_uri if return_uri else ''
+        gcs_client = self.client()
+        r: list[str] = []
+
+        try:
+            bucket = gcs_client.bucket(self.bucket_name)
+            # List blobs with the given prefix, limiting to n_max
+            blobs = bucket.list_blobs(prefix=self.prefix, max_results=n_max)
+
+            for blob in blobs:
+                r.append(f'{p}{blob.name}')
+                if len(r) >= n_max:
+                    break
+
+        except GoogleAPIError as e:
+            self.handle_gcs_error(e, self.bucket_name, f'list objects from {self.prefix}')
+        return r
+
+    @classmethod
+    def handle_gcs_error(cls, e: Exception, bucket_name: str, operation: str = '', *, ignore_404: bool = False) -> None:
+        """Handle GCS-specific errors and convert them to appropriate exceptions"""
+        if isinstance(e, NotFound):
+            if ignore_404:
+                return
+            raise excs.Error(f'Bucket or object {bucket_name} not found during {operation}: {str(e)!r}')
+        elif isinstance(e, Forbidden):
+            raise excs.Error(f'Access denied to bucket {bucket_name} during {operation}: {str(e)!r}')
+        elif isinstance(e, GoogleAPIError):
+            # Handle other Google API errors
+            error_message = str(e)
+            if 'Precondition' in error_message:
+                raise excs.Error(f'Precondition failed for bucket {bucket_name} during {operation}: {error_message}')
+            else:
+                raise excs.Error(f'Error during {operation} in bucket {bucket_name}: {error_message}')
+        else:
+            # Generic error handling
+            raise excs.Error(f'Unexpected error during {operation} in bucket {bucket_name}: {str(e)!r}')
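
GCSStore.delete accumulates blobs and removes them through the client's batch context, which coalesces many operations into a single request; GCS caps a batch at 100 operations, hence the chunks of 100. The underlying google-cloud-storage idiom, as a standalone sketch with hypothetical bucket and prefix names:

from google.cloud import storage

client = storage.Client()
bucket = client.bucket('my-bucket')  # hypothetical bucket
blobs = list(bucket.list_blobs(prefix='tbl_00000000/'))  # hypothetical table prefix

# Inside client.batch(), each delete() is deferred and sent as one batched request on exit.
for i in range(0, len(blobs), 100):
    with client.batch():
        for blob in blobs[i:i + 100]:
            blob.delete()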
pixeltable/utils/lancedb.py ADDED
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Literal
+
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+from pixeltable.catalog import Catalog
+from pixeltable.env import Env
+
+_logger = logging.getLogger('pixeltable')
+
+
+def export_lancedb(
+    table_or_df: pxt.Table | pxt.DataFrame,
+    db_uri: Path,
+    table_name: str,
+    batch_size_bytes: int = 128 * 2**20,
+    if_exists: Literal['error', 'overwrite', 'append'] = 'error',
+) -> None:
+    """
+    Exports a dataframe's data to a LanceDB table.
+
+    This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
+    `RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
+
+    __Requirements:__
+
+    - `pip install lancedb`
+
+    Args:
+        table_or_df : Table or Dataframe to export.
+        db_uri: Local Path to the LanceDB database.
+        table_name : Name of the table in the LanceDB database.
+        batch_size_bytes : Maximum size in bytes for each batch.
+        if_exists: Determines the behavior if the table already exists. Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'overwrite'`: overwrite the existing table
+            - `'append'`: append to the existing table
+    """
+    Env.get().require_package('lancedb')
+
+    import lancedb  # type: ignore[import-untyped]
+
+    from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
+
+    if if_exists not in ('error', 'overwrite', 'append'):
+        raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
+
+    df: pxt.DataFrame
+    if isinstance(table_or_df, pxt.catalog.Table):
+        df = table_or_df._df()
+    else:
+        df = table_or_df
+
+    db_exists = False
+    if db_uri.exists():
+        if not db_uri.is_dir():
+            raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
+        db_exists = True
+
+    try:
+        db = lancedb.connect(str(db_uri))
+        lance_tbl: lancedb.LanceTable | None = None
+        try:
+            lance_tbl = db.open_table(table_name)
+            if if_exists == 'error':
+                raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
+        except ValueError:
+            # table doesn't exist
+            pass
+
+        with Catalog.get().begin_xact(for_write=False):
+            if lance_tbl is None or if_exists == 'overwrite':
+                mode = 'overwrite' if lance_tbl is not None else 'create'
+                arrow_schema = to_arrow_schema(df.schema)
+                _ = db.create_table(table_name, to_record_batches(df, batch_size_bytes), schema=arrow_schema, mode=mode)
+            else:
+                lance_tbl.add(to_record_batches(df, batch_size_bytes))
+
+    except Exception as e:
+        # cleanup
+        if not db_exists:
+            shutil.rmtree(db_uri)
+        raise e
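
A usage sketch for the new export helper (table and path names hypothetical; requires pip install lancedb):

from pathlib import Path

import pixeltable as pxt
from pixeltable.utils.lancedb import export_lancedb

t = pxt.get_table('films')  # hypothetical Pixeltable table
# Stream the table's rows into a local LanceDB database in ~128 MiB Arrow record batches.
export_lancedb(t, Path('./lance_data'), 'films', if_exists='overwrite')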