pixeltable 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +119 -100
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +118 -122
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +322 -257
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +68 -77
- pixeltable/env.py +74 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +4 -5
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +25 -25
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +18 -20
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +2 -24
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +52 -36
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/video.py +8 -13
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +30 -28
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +125 -61
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +8 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/METADATA +1 -1
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.18.dist-info/RECORD +0 -211
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import threading
|
|
4
|
+
import uuid
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Iterator
|
|
7
|
+
|
|
8
|
+
from azure.core.exceptions import AzureError
|
|
9
|
+
|
|
10
|
+
from pixeltable import env, exceptions as excs
|
|
11
|
+
from pixeltable.config import Config
|
|
12
|
+
from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from azure.storage.blob import BlobProperties, BlobServiceClient
|
|
16
|
+
|
|
17
|
+
from pixeltable.catalog import Column
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Module-wide logger; all records go to the shared 'pixeltable' logger.
_logger = logging.getLogger('pixeltable')


# Guards lookups and insertions in the client registry used by AzureBlobStore.client().
client_lock = threading.Lock()


@env.register_client('azure_blob')
def _() -> dict[str, 'BlobServiceClient']:
    """Build the registry object stored under the 'azure_blob' client name.

    The registry starts out empty; AzureBlobStore.client() fills it on demand,
    keyed by the store's container-free URI.
    """
    registry: dict[str, 'BlobServiceClient'] = {}
    return registry
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AzureBlobStore(ObjectStoreBase):
    """Class to handle Azure Blob Storage operations."""

    # TODO: This needs to be redesigned to use asyncio.

    # URI of the Azure Blob Storage container
    # Always ends with a slash
    __base_uri: str

    # Storage account name
    __account_name: str

    # Container name extracted from the URI
    __container_name: str

    # Prefix path within the container, either empty or ending with a slash
    __prefix_name: str

    # URI scheme (wasb, wasbs, abfs, abfss, https)
    __scheme: str

    soa: StorageObjectAddress

    def __init__(self, soa: StorageObjectAddress):
        """Initialize the store from a parsed storage object address.

        Args:
            soa: Address supplying the scheme, account, container and prefix of the target location.
        """
        self.soa = soa
        self.__scheme = soa.scheme
        self.__account_name = soa.account
        self.__container_name = soa.container
        self.__prefix_name = soa.prefix

        # Reconstruct base URI in normalized format
        self.__base_uri = self.soa.prefix_free_uri + self.__prefix_name
        # A single format string with lazy %-args: passing a second pre-formatted string as a
        # positional argument would make logging treat it as a %-format argument and fail at emit time.
        _logger.info(
            'Initialized AzureBlobStore with base URI: %s, account: %s, container: %s, prefix: %s',
            self.__base_uri,
            self.__account_name,
            self.__container_name,
            self.__prefix_name,
        )

    def client(self) -> 'BlobServiceClient':
        """Return the Azure Blob Storage client for this store's account endpoint.

        Clients are cached in the environment's 'azure_blob' registry, keyed by the container-free URI.
        Registry access is serialized with the module-level `client_lock`.

        Raises:
            excs.Error: If only one of 'storage_account_name' / 'storage_account_key' is configured,
                or if the client cannot be created.
        """
        client_dict: dict[str, 'BlobServiceClient'] = env.Env.get().get_client('azure_blob')
        with client_lock:
            uri = self.soa.container_free_uri
            if uri not in client_dict:
                storage_account_name = Config.get().get_string_value('storage_account_name', section='azure')
                storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
                if (storage_account_name is None) != (storage_account_key is None):
                    raise excs.Error(
                        "Azure 'storage_account_name' and 'storage_account_key' must be specified together."
                    )
                if storage_account_name is None or storage_account_name != self.__account_name:
                    # Attempt a connection to a public resource, with no account key
                    client_dict[uri] = self.create_client(endpoint_url=uri)
                else:
                    client_dict[uri] = self.create_client(
                        endpoint_url=uri, account_name=self.__account_name, account_key=storage_account_key
                    )
            return client_dict[uri]

    @property
    def account_name(self) -> str:
        """Return the storage account name."""
        return self.__account_name

    @property
    def container_name(self) -> str:
        """Return the container name from the base URI."""
        return self.__container_name

    @property
    def prefix(self) -> str:
        """Return the prefix from the base URI."""
        return self.__prefix_name

    def validate(self, error_col_name: str) -> str | None:
        """
        Checks if the URI exists and is accessible.

        Args:
            error_col_name: Name included in the error message if validation fails.

        Returns:
            str: The base URI if the container exists and is accessible, None otherwise.

        Raises:
            excs.Error: If the container properties cannot be retrieved.
        """
        try:
            container_client = self.client().get_container_client(self.container_name)
            # Check if container exists by trying to get its properties
            container_client.get_container_properties()
            return self.__base_uri
        except AzureError as e:
            # handle_azure_error() raises for every AzureError when ignore_404 is not set,
            # so the `return None` below is only a fallback.
            self.handle_azure_error(e, self.container_name, f'validate container {error_col_name}')
        return None

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Copies a blob to a local file. Thread safe.

        Args:
            src_path: Blob path relative to this store's prefix.
            dest_path: Local file to write; it is created or truncated.

        Raises:
            excs.Error: If the download fails.
        """
        try:
            blob_client = self.client().get_blob_client(container=self.container_name, blob=self.prefix + src_path)
            with open(dest_path, 'wb') as download_file:
                # Stream straight into the file instead of materializing the whole blob in memory.
                blob_client.download_blob().readinto(download_file)
        except AzureError as e:
            self.handle_azure_error(e, self.container_name, f'download file {src_path}')
            raise

    # TODO: utils package should not include back-references to `Column`
    def copy_local_file(self, col: 'Column', src_path: Path) -> str:
        """Copy a local file to Azure Blob Storage, and return its new URL

        Args:
            col: Column the file belongs to; its table id, column id and table version name the blob.
            src_path: Local file to upload; its suffix is kept as the blob's extension.

        Returns:
            The URI of the uploaded blob.

        Raises:
            excs.Error: If the upload fails.
        """
        prefix, filename = ObjectPath.create_prefix_raw(
            col.get_tbl().id, col.id, col.get_tbl().version, ext=src_path.suffix
        )
        blob_name = f'{self.prefix}{prefix}/{filename}'
        new_file_uri = f'{self.__base_uri}{prefix}/{filename}'

        try:
            blob_client = self.client().get_blob_client(container=self.container_name, blob=blob_name)
            with open(src_path, 'rb') as data:
                blob_client.upload_blob(data, overwrite=True)
            _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
            return new_file_uri
        except AzureError as e:
            self.handle_azure_error(e, self.container_name, f'upload file {src_path}')
            raise

    def _get_filtered_blobs(
        self, tbl_id: uuid.UUID | None, tbl_version: int | None = None
    ) -> Iterator['BlobProperties']:
        """Private method to get filtered blobs for a table, optionally filtered by version.

        The returned iterator is lazy: errors raised by the service while iterating surface in the
        caller, not here.

        Args:
            tbl_id: Table UUID to filter by; None selects every blob under this store's prefix
            tbl_version: Optional table version to filter by; must be None if tbl_id is None

        Returns:
            Iterator over blob objects matching the criteria

        Raises:
            excs.Error: If the listing cannot be set up.
        """
        # Use ObjectPath to construct the prefix for this table
        if tbl_id is None:
            prefix = self.prefix
            assert tbl_version is None, 'tbl_version must be None if tbl_id is None'
        else:
            table_prefix = ObjectPath.table_prefix(tbl_id)
            prefix = f'{self.prefix}{table_prefix}/'

        try:
            container_client = self.client().get_container_client(self.container_name)

            blob_iterator: Iterator['BlobProperties']
            if tbl_version is None:
                # Return all blobs with the table prefix
                blob_iterator = container_client.list_blobs(name_starts_with=prefix)
            else:
                # Filter by both table_id and table_version using the ObjectPath pattern
                # Pattern: tbl_id_col_id_version_uuid
                version_pattern = re.compile(
                    rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
                )
                # Get all blobs with the prefix and filter by version pattern
                all_blobs = container_client.list_blobs(name_starts_with=prefix)
                # Only the final path component (the file name) is matched against the pattern.
                blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))

            return blob_iterator

        except AzureError as e:
            self.handle_azure_error(e, self.container_name, f'setup iterator {self.prefix}')
            raise

    def count(self, tbl_id: uuid.UUID | None, tbl_version: int | None = None) -> int:
        """Count the number of files belonging to tbl_id. If tbl_version is not None,
        count only those files belonging to the specified tbl_version.

        Args:
            tbl_id: Table UUID to count blobs for
            tbl_version: Optional table version to filter by

        Returns:
            Number of blobs matching the criteria

        Raises:
            excs.Error: If the blobs cannot be listed.
        """
        blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
        try:
            # The iterator is lazy, so service errors surface while it is being consumed.
            return sum(1 for _ in blob_iterator)
        except AzureError as e:
            self.handle_azure_error(e, self.container_name, f'counting with {self.prefix}')
            raise

    def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
        only those files belonging to the specified tbl_version.

        Args:
            tbl_id: Table UUID to delete blobs for
            tbl_version: Optional table version to filter by

        Returns:
            Number of blobs deleted

        Raises:
            excs.Error: If listing or deleting blobs fails.
        """
        assert tbl_id is not None
        blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
        total_deleted = 0

        try:
            container_client = self.client().get_container_client(self.container_name)

            for blob in blob_iterator:
                # TODO: Figure out how to properly use batch method delete_blobs(), it doesn't seem to work properly
                container_client.delete_blob(blob.name)
                total_deleted += 1

            return total_deleted

        except AzureError as e:
            self.handle_azure_error(e, self.container_name, f'deleting with {self.prefix}')
            raise

    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects found in the specified destination bucket.
        Each returned object includes the full set of prefixes.
        if return_uri is True, full URI's are returned; otherwise, just the object keys.

        Args:
            return_uri: Whether to prepend the prefix-free URI to each blob name.
            n_max: Maximum number of objects to return; values <= 0 yield an empty list.

        Returns:
            At most n_max blob names or URIs.

        Raises:
            excs.Error: If the blobs cannot be listed.
        """
        p = self.soa.prefix_free_uri if return_uri else ''
        r: list[str] = []
        if n_max <= 0:
            # Nothing requested; without this guard the loop below would still return one object.
            return r
        try:
            blob_iterator = self._get_filtered_blobs(tbl_id=None, tbl_version=None)
            for blob in blob_iterator:
                r.append(f'{p}{blob.name}')
                if len(r) >= n_max:
                    return r

        except AzureError as e:
            self.handle_azure_error(e, self.__container_name, f'list objects from {self.__base_uri}')
        return r

    @classmethod
    def handle_azure_error(
        cls, e: 'AzureError', container_name: str, operation: str = '', *, ignore_404: bool = False
    ) -> None:
        """Translate an Azure SDK error into an `excs.Error` with a descriptive message.

        Args:
            e: The Azure error to translate.
            container_name: Container named in the error message.
            operation: Description of the failed operation, included in the error message.
            ignore_404: If True, a ResourceNotFoundError is ignored and the method returns normally.

        Raises:
            excs.Error: Always, unless `ignore_404` is set and `e` is a ResourceNotFoundError.
                The original error is attached as the cause.
        """
        from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError

        if ignore_404 and isinstance(e, ResourceNotFoundError):
            return

        # Check the more specific exception types first; the generic HttpResponseError branch comes last.
        if isinstance(e, ResourceNotFoundError):
            raise excs.Error(f'Container {container_name} or blob not found during {operation}: {str(e)!r}') from e
        elif isinstance(e, ClientAuthenticationError):
            raise excs.Error(
                f'Authentication failed for container {container_name} during {operation}: {str(e)!r}'
            ) from e
        elif isinstance(e, HttpResponseError):
            if e.status_code == 403:
                raise excs.Error(f'Access denied to container {container_name} during {operation}: {str(e)!r}') from e
            elif e.status_code == 412:
                raise excs.Error(
                    f'Precondition failed for container {container_name} during {operation}: {str(e)!r}'
                ) from e
            else:
                raise excs.Error(
                    f'HTTP error during {operation} in container {container_name}: {e.status_code} - {str(e)!r}'
                ) from e
        else:
            raise excs.Error(f'Error during {operation} in container {container_name}: {str(e)!r}') from e

    @classmethod
    def create_client(
        cls, endpoint_url: str, account_name: str | None = None, account_key: str | None = None
    ) -> 'BlobServiceClient':
        """Create a BlobServiceClient for the given endpoint.

        Args:
            endpoint_url: Account endpoint URL.
            account_name: Storage account name; must be given together with `account_key`.
            account_key: Storage account key; must be given together with `account_name`.

        Returns:
            A client using a named-key credential if an account name/key pair is given,
            otherwise a client created without a credential.

        Raises:
            excs.Error: If the client cannot be created.
        """
        from azure.core.credentials import AzureNamedKeyCredential
        from azure.storage.blob import BlobServiceClient  # TODO: Use azure.storage.blob.aio instead

        assert (account_name is None) == (account_key is None)
        try:
            # e.g. endpoint_url: str = f'https://{account_name}.blob.core.windows.net'
            assert endpoint_url is not None, 'No Azure Storage account information provided'

            # No credential (None) unless an account name/key pair was supplied
            credential = None
            if account_name is not None:
                credential = AzureNamedKeyCredential(name=account_name, key=account_key)
            return BlobServiceClient(
                account_url=endpoint_url,
                credential=credential,
                max_single_get_size=(32 * 2**20),
                max_chunk_get_size=(4 * 2**20),
                connection_timeout=15,
                read_timeout=30,
            )
        except Exception as e:
            raise excs.Error(f'Failed to create Azure Blob Storage client: {str(e)!r}') from e
|
pixeltable/utils/code.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import types
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
3
|
from pixeltable.func import Function
|
|
5
4
|
|
|
6
5
|
# Utilities related to the organization of the Pixeltable codebase.
|
|
7
6
|
|
|
8
7
|
|
|
9
|
-
def local_public_names(mod_name: str, exclude:
|
|
8
|
+
def local_public_names(mod_name: str, exclude: list[str] | None = None) -> list[str]:
|
|
10
9
|
"""
|
|
11
10
|
Returns a list of all functions and submodules that are local to the specified module and are
|
|
12
11
|
publicly accessible. Intended to facilitate implementation of module __dir__() methods for
|
pixeltable/utils/dbms.py
CHANGED
|
@@ -29,9 +29,7 @@ class Dbms(abc.ABC):
|
|
|
29
29
|
def default_system_db_url(self) -> str: ...
|
|
30
30
|
|
|
31
31
|
@abc.abstractmethod
|
|
32
|
-
def
|
|
33
|
-
self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
|
|
34
|
-
) -> None: ...
|
|
32
|
+
def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index: ...
|
|
35
33
|
|
|
36
34
|
|
|
37
35
|
class PostgresqlDbms(Dbms):
|
|
@@ -52,17 +50,14 @@ class PostgresqlDbms(Dbms):
|
|
|
52
50
|
a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
|
|
53
51
|
return a
|
|
54
52
|
|
|
55
|
-
def
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
index_name,
|
|
60
|
-
index_value_sa_col,
|
|
53
|
+
def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index:
|
|
54
|
+
return sql.Index(
|
|
55
|
+
store_index_name,
|
|
56
|
+
sa_value_col,
|
|
61
57
|
postgresql_using='hnsw',
|
|
62
58
|
postgresql_with={'m': 16, 'ef_construction': 64},
|
|
63
|
-
postgresql_ops={
|
|
59
|
+
postgresql_ops={sa_value_col.name: metric},
|
|
64
60
|
)
|
|
65
|
-
idx.create(bind=conn)
|
|
66
61
|
|
|
67
62
|
|
|
68
63
|
class CockroachDbms(Dbms):
|
|
@@ -82,11 +77,12 @@ class CockroachDbms(Dbms):
|
|
|
82
77
|
def default_system_db_url(self) -> str:
|
|
83
78
|
return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
|
|
84
79
|
|
|
85
|
-
def
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
)
|
|
92
|
-
|
|
80
|
+
def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index:
|
|
81
|
+
# TODO: can the Create Index statement be generated via sqlalchemy?
|
|
82
|
+
# if not, change this method to create_vector_index_stmt(...) -> str
|
|
83
|
+
# original code:
|
|
84
|
+
# create_index_sql = sql.text(
|
|
85
|
+
# f"""CREATE VECTOR INDEX {store_index_name} ON {sa_value_col.table.name}
|
|
86
|
+
# ({sa_value_col.name} {metric})"""
|
|
87
|
+
# )
|
|
88
|
+
return None
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
3
|
import pandas as pd
|
|
5
4
|
from pandas.io.formats.style import Styler
|
|
@@ -11,7 +10,7 @@ class _Descriptor:
|
|
|
11
10
|
# The remaining fields only affect the behavior if `body` is a pd.DataFrame.
|
|
12
11
|
show_index: bool
|
|
13
12
|
show_header: bool
|
|
14
|
-
styler:
|
|
13
|
+
styler: Styler | None = None
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
class DescriptionHelper:
|
|
@@ -36,7 +35,7 @@ class DescriptionHelper:
|
|
|
36
35
|
descriptor: str | pd.DataFrame,
|
|
37
36
|
show_index: bool = False,
|
|
38
37
|
show_header: bool = True,
|
|
39
|
-
styler:
|
|
38
|
+
styler: Styler | None = None,
|
|
40
39
|
) -> None:
|
|
41
40
|
self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
|
|
42
41
|
|
pixeltable/utils/documents.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import os
|
|
3
|
-
from typing import Optional
|
|
4
3
|
|
|
5
4
|
import bs4
|
|
6
5
|
import fitz # type: ignore[import-untyped]
|
|
@@ -13,10 +12,10 @@ from pixeltable.env import Env
|
|
|
13
12
|
@dataclasses.dataclass
|
|
14
13
|
class DocumentHandle:
|
|
15
14
|
format: ts.DocumentType.DocumentFormat
|
|
16
|
-
bs_doc:
|
|
17
|
-
md_ast:
|
|
18
|
-
pdf_doc:
|
|
19
|
-
txt_doc:
|
|
15
|
+
bs_doc: bs4.BeautifulSoup | None = None
|
|
16
|
+
md_ast: dict | None = None
|
|
17
|
+
pdf_doc: fitz.Document | None = None
|
|
18
|
+
txt_doc: str | None = None
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
def get_document_handle(path: str) -> DocumentHandle:
|
|
@@ -34,7 +33,7 @@ def get_document_handle(path: str) -> DocumentHandle:
|
|
|
34
33
|
raise excs.Error(f'Unrecognized document format: {path}')
|
|
35
34
|
|
|
36
35
|
|
|
37
|
-
def get_handle_by_extension(path: str, extension: str) ->
|
|
36
|
+
def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
|
|
38
37
|
doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
|
|
39
38
|
|
|
40
39
|
try:
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, Callable,
|
|
2
|
+
from typing import Any, Callable, TypeVar
|
|
3
3
|
|
|
4
4
|
R = TypeVar('R')
|
|
5
5
|
|
|
6
6
|
logger = logging.getLogger('pixeltable')
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) ->
|
|
9
|
+
def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
|
|
10
10
|
"""
|
|
11
11
|
Runs a cleanup function. If interrupted, retry cleanup.
|
|
12
12
|
The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
|
pixeltable/utils/filecache.py
CHANGED
|
@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
|
|
|
9
9
|
from dataclasses import dataclass
|
|
10
10
|
from datetime import datetime, timezone
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import NamedTuple
|
|
12
|
+
from typing import NamedTuple
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
import pixeltable.exceptions as excs
|
|
@@ -58,7 +58,7 @@ class FileCache:
|
|
|
58
58
|
- implement MRU eviction for queries that exceed the capacity
|
|
59
59
|
"""
|
|
60
60
|
|
|
61
|
-
__instance:
|
|
61
|
+
__instance: FileCache | None = None
|
|
62
62
|
|
|
63
63
|
cache: OrderedDict[str, CacheEntry]
|
|
64
64
|
total_size: int
|
|
@@ -126,12 +126,12 @@ class FileCache:
|
|
|
126
126
|
return 0
|
|
127
127
|
return int(self.total_size / len(self.cache))
|
|
128
128
|
|
|
129
|
-
def num_files(self, tbl_id:
|
|
129
|
+
def num_files(self, tbl_id: UUID | None = None) -> int:
|
|
130
130
|
if tbl_id is None:
|
|
131
131
|
return len(self.cache)
|
|
132
132
|
return sum(e.tbl_id == tbl_id for e in self.cache.values())
|
|
133
133
|
|
|
134
|
-
def clear(self, tbl_id:
|
|
134
|
+
def clear(self, tbl_id: UUID | None = None) -> None:
|
|
135
135
|
"""
|
|
136
136
|
For testing purposes: allow resetting capacity and stats.
|
|
137
137
|
"""
|
|
@@ -174,7 +174,7 @@ class FileCache:
|
|
|
174
174
|
h.update(url.encode())
|
|
175
175
|
return h.hexdigest()
|
|
176
176
|
|
|
177
|
-
def lookup(self, url: str) ->
|
|
177
|
+
def lookup(self, url: str) -> Path | None:
|
|
178
178
|
self.num_requests += 1
|
|
179
179
|
key = self._url_hash(url)
|
|
180
180
|
entry = self.cache.get(key, None)
|
pixeltable/utils/formatter.py
CHANGED
|
@@ -4,7 +4,7 @@ import io
|
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
6
|
import mimetypes
|
|
7
|
-
from typing import Any, Callable
|
|
7
|
+
from typing import Any, Callable
|
|
8
8
|
|
|
9
9
|
import av
|
|
10
10
|
import numpy as np
|
|
@@ -39,7 +39,7 @@ class Formatter:
|
|
|
39
39
|
self.__num_cols = num_cols
|
|
40
40
|
self.__http_address = http_address
|
|
41
41
|
|
|
42
|
-
def get_pandas_formatter(self, col_type: ts.ColumnType) ->
|
|
42
|
+
def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
|
|
43
43
|
if col_type.is_string_type():
|
|
44
44
|
return self.format_string
|
|
45
45
|
if col_type.is_float_type():
|
|
@@ -184,7 +184,7 @@ class Formatter:
|
|
|
184
184
|
"""
|
|
185
185
|
|
|
186
186
|
@classmethod
|
|
187
|
-
def extract_first_video_frame(cls, file_path: str) ->
|
|
187
|
+
def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
|
|
188
188
|
with av.open(file_path) as container:
|
|
189
189
|
try:
|
|
190
190
|
img = next(container.decode(video=0)).to_image()
|
|
@@ -224,9 +224,7 @@ class Formatter:
|
|
|
224
224
|
"""
|
|
225
225
|
|
|
226
226
|
@classmethod
|
|
227
|
-
def make_document_thumbnail(
|
|
228
|
-
cls, file_path: str, max_width: int = 320, max_height: int = 320
|
|
229
|
-
) -> Optional[Image.Image]:
|
|
227
|
+
def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
|
|
230
228
|
"""
|
|
231
229
|
Returns a thumbnail image of a document.
|
|
232
230
|
"""
|
pixeltable/utils/gcs_store.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
|
5
5
|
import urllib.parse
|
|
6
6
|
import uuid
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Iterator
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterator
|
|
9
9
|
|
|
10
10
|
from google.api_core.exceptions import GoogleAPIError
|
|
11
11
|
from google.cloud import storage # type: ignore[attr-defined]
|
|
@@ -81,7 +81,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
81
81
|
"""Return the prefix from the base URI."""
|
|
82
82
|
return self.__prefix_name
|
|
83
83
|
|
|
84
|
-
def validate(self, error_col_name: str) ->
|
|
84
|
+
def validate(self, error_col_name: str) -> str | None:
|
|
85
85
|
"""
|
|
86
86
|
Checks if the URI exists.
|
|
87
87
|
|
|
@@ -99,7 +99,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
99
99
|
self.handle_gcs_error(e, self.bucket_name, f'validate bucket {error_col_name}')
|
|
100
100
|
return None
|
|
101
101
|
|
|
102
|
-
def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext:
|
|
102
|
+
def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
|
|
103
103
|
"""
|
|
104
104
|
Construct a new, unique URI for a persisted media file.
|
|
105
105
|
"""
|
|
@@ -107,12 +107,12 @@ class GCSStore(ObjectStoreBase):
|
|
|
107
107
|
parent = f'{self.__base_uri}{prefix}'
|
|
108
108
|
return f'{parent}/{filename}'
|
|
109
109
|
|
|
110
|
-
def _prepare_uri(self, col: Column, ext:
|
|
110
|
+
def _prepare_uri(self, col: Column, ext: str | None = None) -> str:
|
|
111
111
|
"""
|
|
112
112
|
Construct a new, unique URI for a persisted media file.
|
|
113
113
|
"""
|
|
114
|
-
assert col.
|
|
115
|
-
return self._prepare_uri_raw(col.
|
|
114
|
+
assert col.get_tbl() is not None, 'Column must be associated with a table'
|
|
115
|
+
return self._prepare_uri_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext=ext)
|
|
116
116
|
|
|
117
117
|
def copy_local_file(self, col: Column, src_path: Path) -> str:
|
|
118
118
|
"""Copy a local file, and return its new URL"""
|
|
@@ -142,7 +142,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
142
142
|
self.handle_gcs_error(e, self.bucket_name, f'download file {src_path}')
|
|
143
143
|
raise
|
|
144
144
|
|
|
145
|
-
def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version:
|
|
145
|
+
def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version: int | None = None) -> Iterator:
|
|
146
146
|
"""Private method to get filtered objects for a table, optionally filtered by version.
|
|
147
147
|
|
|
148
148
|
Args:
|
|
@@ -168,7 +168,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
168
168
|
|
|
169
169
|
return blob_iterator
|
|
170
170
|
|
|
171
|
-
def count(self, tbl_id: uuid.UUID, tbl_version:
|
|
171
|
+
def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
|
|
172
172
|
"""Count the number of files belonging to tbl_id. If tbl_version is not None,
|
|
173
173
|
count only those files belonging to the specified tbl_version.
|
|
174
174
|
|
|
@@ -193,7 +193,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
193
193
|
self.handle_gcs_error(e, self.bucket_name, f'setup iterator {self.prefix}')
|
|
194
194
|
raise
|
|
195
195
|
|
|
196
|
-
def delete(self, tbl_id: uuid.UUID, tbl_version:
|
|
196
|
+
def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
|
|
197
197
|
"""Delete all files belonging to tbl_id. If tbl_version is not None, delete
|
|
198
198
|
only those files belonging to the specified tbl_version.
|
|
199
199
|
|