pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/utils/formatter.py
CHANGED
|
@@ -4,7 +4,7 @@ import io
|
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
6
|
import mimetypes
|
|
7
|
-
from typing import Any, Callable
|
|
7
|
+
from typing import Any, Callable
|
|
8
8
|
|
|
9
9
|
import av
|
|
10
10
|
import numpy as np
|
|
@@ -39,7 +39,7 @@ class Formatter:
|
|
|
39
39
|
self.__num_cols = num_cols
|
|
40
40
|
self.__http_address = http_address
|
|
41
41
|
|
|
42
|
-
def get_pandas_formatter(self, col_type: ts.ColumnType) ->
|
|
42
|
+
def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
|
|
43
43
|
if col_type.is_string_type():
|
|
44
44
|
return self.format_string
|
|
45
45
|
if col_type.is_float_type():
|
|
@@ -184,7 +184,7 @@ class Formatter:
|
|
|
184
184
|
"""
|
|
185
185
|
|
|
186
186
|
@classmethod
|
|
187
|
-
def extract_first_video_frame(cls, file_path: str) ->
|
|
187
|
+
def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
|
|
188
188
|
with av.open(file_path) as container:
|
|
189
189
|
try:
|
|
190
190
|
img = next(container.decode(video=0)).to_image()
|
|
@@ -224,9 +224,7 @@ class Formatter:
|
|
|
224
224
|
"""
|
|
225
225
|
|
|
226
226
|
@classmethod
|
|
227
|
-
def make_document_thumbnail(
|
|
228
|
-
cls, file_path: str, max_width: int = 320, max_height: int = 320
|
|
229
|
-
) -> Optional[Image.Image]:
|
|
227
|
+
def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
|
|
230
228
|
"""
|
|
231
229
|
Returns a thumbnail image of a document.
|
|
232
230
|
"""
|
pixeltable/utils/gcs_store.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
|
5
5
|
import urllib.parse
|
|
6
6
|
import uuid
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Iterator
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterator
|
|
9
9
|
|
|
10
10
|
from google.api_core.exceptions import GoogleAPIError
|
|
11
11
|
from google.cloud import storage # type: ignore[attr-defined]
|
|
@@ -81,7 +81,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
81
81
|
"""Return the prefix from the base URI."""
|
|
82
82
|
return self.__prefix_name
|
|
83
83
|
|
|
84
|
-
def validate(self, error_col_name: str) ->
|
|
84
|
+
def validate(self, error_col_name: str) -> str | None:
|
|
85
85
|
"""
|
|
86
86
|
Checks if the URI exists.
|
|
87
87
|
|
|
@@ -99,7 +99,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
99
99
|
self.handle_gcs_error(e, self.bucket_name, f'validate bucket {error_col_name}')
|
|
100
100
|
return None
|
|
101
101
|
|
|
102
|
-
def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext:
|
|
102
|
+
def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
|
|
103
103
|
"""
|
|
104
104
|
Construct a new, unique URI for a persisted media file.
|
|
105
105
|
"""
|
|
@@ -107,12 +107,12 @@ class GCSStore(ObjectStoreBase):
|
|
|
107
107
|
parent = f'{self.__base_uri}{prefix}'
|
|
108
108
|
return f'{parent}/{filename}'
|
|
109
109
|
|
|
110
|
-
def _prepare_uri(self, col: Column, ext:
|
|
110
|
+
def _prepare_uri(self, col: Column, ext: str | None = None) -> str:
|
|
111
111
|
"""
|
|
112
112
|
Construct a new, unique URI for a persisted media file.
|
|
113
113
|
"""
|
|
114
|
-
assert col.
|
|
115
|
-
return self._prepare_uri_raw(col.
|
|
114
|
+
assert col.get_tbl() is not None, 'Column must be associated with a table'
|
|
115
|
+
return self._prepare_uri_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext=ext)
|
|
116
116
|
|
|
117
117
|
def copy_local_file(self, col: Column, src_path: Path) -> str:
|
|
118
118
|
"""Copy a local file, and return its new URL"""
|
|
@@ -142,7 +142,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
142
142
|
self.handle_gcs_error(e, self.bucket_name, f'download file {src_path}')
|
|
143
143
|
raise
|
|
144
144
|
|
|
145
|
-
def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version:
|
|
145
|
+
def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version: int | None = None) -> Iterator:
|
|
146
146
|
"""Private method to get filtered objects for a table, optionally filtered by version.
|
|
147
147
|
|
|
148
148
|
Args:
|
|
@@ -168,7 +168,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
168
168
|
|
|
169
169
|
return blob_iterator
|
|
170
170
|
|
|
171
|
-
def count(self, tbl_id: uuid.UUID, tbl_version:
|
|
171
|
+
def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
|
|
172
172
|
"""Count the number of files belonging to tbl_id. If tbl_version is not None,
|
|
173
173
|
count only those files belonging to the specified tbl_version.
|
|
174
174
|
|
|
@@ -193,7 +193,7 @@ class GCSStore(ObjectStoreBase):
|
|
|
193
193
|
self.handle_gcs_error(e, self.bucket_name, f'setup iterator {self.prefix}')
|
|
194
194
|
raise
|
|
195
195
|
|
|
196
|
-
def delete(self, tbl_id: uuid.UUID, tbl_version:
|
|
196
|
+
def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
|
|
197
197
|
"""Delete all files belonging to tbl_id. If tbl_version is not None, delete
|
|
198
198
|
only those files belonging to the specified tbl_version.
|
|
199
199
|
|
pixeltable/utils/local_store.py
CHANGED
|
@@ -10,7 +10,7 @@ import urllib.request
|
|
|
10
10
|
import uuid
|
|
11
11
|
from collections import defaultdict
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import TYPE_CHECKING
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
14
|
from uuid import UUID
|
|
15
15
|
|
|
16
16
|
import PIL.Image
|
|
@@ -35,7 +35,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
35
35
|
|
|
36
36
|
__base_dir: Path
|
|
37
37
|
|
|
38
|
-
soa:
|
|
38
|
+
soa: StorageObjectAddress | None
|
|
39
39
|
|
|
40
40
|
def __init__(self, location: Path | StorageObjectAddress):
|
|
41
41
|
if isinstance(location, Path):
|
|
@@ -69,7 +69,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
69
69
|
raise excs.Error(f'{error_col_name}`destination` must be a valid path. Error: {e}') from None
|
|
70
70
|
|
|
71
71
|
@staticmethod
|
|
72
|
-
def file_url_to_path(url: str) ->
|
|
72
|
+
def file_url_to_path(url: str) -> Path | None:
|
|
73
73
|
"""Convert a file:// URI to a Path object with support for Windows UNC paths."""
|
|
74
74
|
assert isinstance(url, str), type(url)
|
|
75
75
|
parsed = urllib.parse.urlparse(url)
|
|
@@ -90,7 +90,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
90
90
|
return Path(path_str)
|
|
91
91
|
|
|
92
92
|
@classmethod
|
|
93
|
-
def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format:
|
|
93
|
+
def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: str | None) -> Path:
|
|
94
94
|
"""Save binary data to a file in a LocalStore. format is ignored for binary data."""
|
|
95
95
|
assert isinstance(file_data, bytes)
|
|
96
96
|
with open(dest_path, 'wb') as f:
|
|
@@ -100,7 +100,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
100
100
|
return dest_path
|
|
101
101
|
|
|
102
102
|
@classmethod
|
|
103
|
-
def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format:
|
|
103
|
+
def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: str | None) -> Path:
|
|
104
104
|
"""Save a PIL Image to a file in a LocalStore with the specified format."""
|
|
105
105
|
if dest_path.suffix != f'.{format}':
|
|
106
106
|
dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
|
|
@@ -111,7 +111,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
111
111
|
os.fsync(f.fileno()) # Forces OS to write to physical storage
|
|
112
112
|
return dest_path
|
|
113
113
|
|
|
114
|
-
def _prepare_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext:
|
|
114
|
+
def _prepare_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> Path:
|
|
115
115
|
"""
|
|
116
116
|
Construct a new, unique Path name in the __base_dir for a persisted file.
|
|
117
117
|
Create the parent directory for the new Path if it does not already exist.
|
|
@@ -121,19 +121,19 @@ class LocalStore(ObjectStoreBase):
|
|
|
121
121
|
parent.mkdir(parents=True, exist_ok=True)
|
|
122
122
|
return parent / filename
|
|
123
123
|
|
|
124
|
-
def _prepare_path(self, col: Column, ext:
|
|
124
|
+
def _prepare_path(self, col: Column, ext: str | None = None) -> Path:
|
|
125
125
|
"""
|
|
126
126
|
Construct a new, unique Path name in the __base_dir for a persisted file.
|
|
127
127
|
Create the parent directory for the new Path if it does not already exist.
|
|
128
128
|
"""
|
|
129
|
-
assert col.
|
|
130
|
-
return self._prepare_path_raw(col.
|
|
129
|
+
assert col.get_tbl() is not None, 'Column must be associated with a table'
|
|
130
|
+
return self._prepare_path_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext)
|
|
131
131
|
|
|
132
132
|
def contains_path(self, file_path: Path) -> bool:
|
|
133
133
|
"""Return True if the given path refers to a file managed by this LocalStore, else False."""
|
|
134
134
|
return str(file_path).startswith(str(self.__base_dir))
|
|
135
135
|
|
|
136
|
-
def resolve_url(self, file_url:
|
|
136
|
+
def resolve_url(self, file_url: str | None) -> Path | None:
|
|
137
137
|
"""Return path if the given url refers to a file managed by this LocalStore, else None.
|
|
138
138
|
|
|
139
139
|
Args:
|
|
@@ -168,7 +168,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
168
168
|
_logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
|
|
169
169
|
return new_file_url
|
|
170
170
|
|
|
171
|
-
def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format:
|
|
171
|
+
def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: str | None) -> tuple[Path, str]:
|
|
172
172
|
"""Save a data object to a file in a LocalStore
|
|
173
173
|
Returns:
|
|
174
174
|
dest_path: Path to the saved file
|
|
@@ -185,7 +185,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
185
185
|
new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
186
186
|
return dest_path, new_file_url
|
|
187
187
|
|
|
188
|
-
def delete(self, tbl_id: UUID, tbl_version:
|
|
188
|
+
def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
|
|
189
189
|
"""Delete all files belonging to tbl_id. If tbl_version is not None, delete
|
|
190
190
|
only those files belonging to the specified tbl_version.
|
|
191
191
|
|
|
@@ -209,7 +209,7 @@ class LocalStore(ObjectStoreBase):
|
|
|
209
209
|
os.remove(p)
|
|
210
210
|
return len(paths)
|
|
211
211
|
|
|
212
|
-
def count(self, tbl_id:
|
|
212
|
+
def count(self, tbl_id: UUID | None, tbl_version: int | None = None) -> int:
|
|
213
213
|
"""
|
|
214
214
|
Return number of files for given tbl_id.
|
|
215
215
|
"""
|
|
@@ -277,7 +277,7 @@ class TempStore:
|
|
|
277
277
|
return env.Env.get().tmp_dir
|
|
278
278
|
|
|
279
279
|
@classmethod
|
|
280
|
-
def count(cls, tbl_id:
|
|
280
|
+
def count(cls, tbl_id: UUID | None = None, tbl_version: int | None = None) -> int:
|
|
281
281
|
return LocalStore(cls._tmp_dir()).count(tbl_id, tbl_version)
|
|
282
282
|
|
|
283
283
|
@classmethod
|
|
@@ -285,11 +285,11 @@ class TempStore:
|
|
|
285
285
|
return LocalStore(cls._tmp_dir()).contains_path(file_path)
|
|
286
286
|
|
|
287
287
|
@classmethod
|
|
288
|
-
def resolve_url(cls, file_url:
|
|
288
|
+
def resolve_url(cls, file_url: str | None) -> Path | None:
|
|
289
289
|
return LocalStore(cls._tmp_dir()).resolve_url(file_url)
|
|
290
290
|
|
|
291
291
|
@classmethod
|
|
292
|
-
def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format:
|
|
292
|
+
def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: str | None) -> tuple[Path, str]:
|
|
293
293
|
return LocalStore(cls._tmp_dir()).save_media_object(data, col, format)
|
|
294
294
|
|
|
295
295
|
@classmethod
|
|
@@ -302,7 +302,7 @@ class TempStore:
|
|
|
302
302
|
_logger.debug(f'Media Storage: deleted {file_path}')
|
|
303
303
|
|
|
304
304
|
@classmethod
|
|
305
|
-
def create_path(cls, tbl_id:
|
|
305
|
+
def create_path(cls, tbl_id: UUID | None = None, extension: str = '') -> Path:
|
|
306
306
|
"""Return a new, unique Path located in the temporary store.
|
|
307
307
|
If tbl_id is provided, the path name will be similar to a LocalStore path based on the tbl_id.
|
|
308
308
|
If tbl_id is None, a random UUID will be used to create the path."""
|
|
pixeltable/utils/object_stores.py
CHANGED
|
@@ -7,7 +7,7 @@ import urllib.parse
|
|
|
7
7
|
import urllib.request
|
|
8
8
|
import uuid
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import TYPE_CHECKING,
|
|
10
|
+
from typing import TYPE_CHECKING, NamedTuple
|
|
11
11
|
from uuid import UUID
|
|
12
12
|
|
|
13
13
|
from pixeltable import env, exceptions as excs
|
|
@@ -44,7 +44,7 @@ class StorageObjectAddress(NamedTuple):
|
|
|
44
44
|
key: str = '' # Key parsed from the source (prefix + object_name)
|
|
45
45
|
prefix: str = '' # Prefix (within the bucket) parsed from the source
|
|
46
46
|
object_name: str = '' # Object name parsed from the source (if requested and applicable)
|
|
47
|
-
path:
|
|
47
|
+
path: Path | None = None
|
|
48
48
|
|
|
49
49
|
@property
|
|
50
50
|
def has_object(self) -> bool:
|
|
@@ -56,11 +56,11 @@ class StorageObjectAddress(NamedTuple):
|
|
|
56
56
|
|
|
57
57
|
@property
|
|
58
58
|
def is_azure_scheme(self) -> bool:
|
|
59
|
-
return self.scheme in
|
|
59
|
+
return self.scheme in ('wasb', 'wasbs', 'abfs', 'abfss')
|
|
60
60
|
|
|
61
61
|
@property
|
|
62
62
|
def has_valid_storage_target(self) -> bool:
|
|
63
|
-
return self.storage_target in
|
|
63
|
+
return self.storage_target in (
|
|
64
64
|
StorageTarget.LOCAL_STORE,
|
|
65
65
|
StorageTarget.S3_STORE,
|
|
66
66
|
StorageTarget.R2_STORE,
|
|
@@ -68,7 +68,7 @@ class StorageObjectAddress(NamedTuple):
|
|
|
68
68
|
StorageTarget.GCS_STORE,
|
|
69
69
|
StorageTarget.AZURE_STORE,
|
|
70
70
|
StorageTarget.HTTP_STORE,
|
|
71
|
-
|
|
71
|
+
)
|
|
72
72
|
|
|
73
73
|
@property
|
|
74
74
|
def prefix_free_uri(self) -> str:
|
|
@@ -120,9 +120,7 @@ class ObjectPath:
|
|
|
120
120
|
return tbl_id.hex
|
|
121
121
|
|
|
122
122
|
@classmethod
|
|
123
|
-
def create_prefix_raw(
|
|
124
|
-
cls, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None
|
|
125
|
-
) -> tuple[str, str]:
|
|
123
|
+
def create_prefix_raw(cls, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> tuple[str, str]:
|
|
126
124
|
"""Construct a unique unix-style prefix and filename for a persisted file.
|
|
127
125
|
The results are derived from table, col, and version specs.
|
|
128
126
|
Returns:
|
|
@@ -202,7 +200,7 @@ class ObjectPath:
|
|
|
202
200
|
container = parsed.netloc
|
|
203
201
|
key = parsed.path.lstrip('/')
|
|
204
202
|
|
|
205
|
-
elif scheme in
|
|
203
|
+
elif scheme in ('wasb', 'wasbs', 'abfs', 'abfss'):
|
|
206
204
|
# Azure-specific URI schemes
|
|
207
205
|
# wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
|
|
208
206
|
# abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
|
|
@@ -216,7 +214,7 @@ class ObjectPath:
|
|
|
216
214
|
raise ValueError(f'Invalid Azure URI format: {src_addr}')
|
|
217
215
|
key = parsed.path.lstrip('/')
|
|
218
216
|
|
|
219
|
-
elif scheme in
|
|
217
|
+
elif scheme in ('http', 'https'):
|
|
220
218
|
# Standard HTTP(S) URL format
|
|
221
219
|
# https://account.blob.core.windows.net/container/<optional path>/<optional object>
|
|
222
220
|
# https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
|
|
@@ -253,7 +251,7 @@ class ObjectPath:
|
|
|
253
251
|
return r
|
|
254
252
|
|
|
255
253
|
@classmethod
|
|
256
|
-
def parse_object_storage_addr(cls, src_addr: str,
|
|
254
|
+
def parse_object_storage_addr(cls, src_addr: str, allow_obj_name: bool) -> StorageObjectAddress:
|
|
257
255
|
"""
|
|
258
256
|
Parses a cloud storage URI into its scheme, bucket, prefix, and object name.
|
|
259
257
|
|
|
@@ -273,14 +271,14 @@ class ObjectPath:
|
|
|
273
271
|
https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
|
|
274
272
|
"""
|
|
275
273
|
soa = cls.parse_object_storage_addr1(src_addr)
|
|
276
|
-
prefix, object_name = cls.separate_prefix_object(soa.key,
|
|
274
|
+
prefix, object_name = cls.separate_prefix_object(soa.key, allow_obj_name)
|
|
277
275
|
assert not object_name.endswith('/')
|
|
278
276
|
r = soa._replace(prefix=prefix, object_name=object_name)
|
|
279
277
|
return r
|
|
280
278
|
|
|
281
279
|
|
|
282
280
|
class ObjectStoreBase:
|
|
283
|
-
def validate(self,
|
|
281
|
+
def validate(self, error_prefix: str) -> str | None:
|
|
284
282
|
"""Check the store configuration. Returns base URI if store is accessible.
|
|
285
283
|
|
|
286
284
|
Args:
|
|
@@ -303,7 +301,7 @@ class ObjectStoreBase:
|
|
|
303
301
|
"""
|
|
304
302
|
raise AssertionError
|
|
305
303
|
|
|
306
|
-
def move_local_file(self, col: Column, src_path: Path) ->
|
|
304
|
+
def move_local_file(self, col: Column, src_path: Path) -> str | None:
|
|
307
305
|
"""Move a file associated with a Column to the store, returning the file's URL within the destination.
|
|
308
306
|
|
|
309
307
|
Args:
|
|
@@ -324,7 +322,7 @@ class ObjectStoreBase:
|
|
|
324
322
|
"""
|
|
325
323
|
raise AssertionError
|
|
326
324
|
|
|
327
|
-
def count(self, tbl_id: UUID, tbl_version:
|
|
325
|
+
def count(self, tbl_id: UUID, tbl_version: int | None = None) -> int:
|
|
328
326
|
"""Return the number of objects in the store associated with the given tbl_id
|
|
329
327
|
|
|
330
328
|
Args:
|
|
@@ -336,7 +334,7 @@ class ObjectStoreBase:
|
|
|
336
334
|
"""
|
|
337
335
|
raise AssertionError
|
|
338
336
|
|
|
339
|
-
def delete(self, tbl_id: UUID, tbl_version:
|
|
337
|
+
def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
|
|
340
338
|
"""Delete objects in the destination for a given table ID, table version.
|
|
341
339
|
|
|
342
340
|
Args:
|
|
@@ -360,28 +358,15 @@ class ObjectStoreBase:
|
|
|
360
358
|
|
|
361
359
|
class ObjectOps:
|
|
362
360
|
@classmethod
|
|
363
|
-
def get_store(cls, dest:
|
|
361
|
+
def get_store(cls, dest: str | None, allow_obj_name: bool, col_name: str | None = None) -> ObjectStoreBase:
|
|
364
362
|
from pixeltable.env import Env
|
|
365
363
|
from pixeltable.utils.local_store import LocalStore
|
|
366
364
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
if dest is None
|
|
370
|
-
else ObjectPath.parse_object_storage_addr(dest, may_contain_object_name=may_contain_object_name)
|
|
371
|
-
)
|
|
365
|
+
dest = dest or str(Env.get().media_dir) # Use local media dir as fallback
|
|
366
|
+
soa = ObjectPath.parse_object_storage_addr(dest, allow_obj_name=allow_obj_name)
|
|
372
367
|
if soa.storage_target == StorageTarget.LOCAL_STORE:
|
|
373
368
|
return LocalStore(soa)
|
|
374
|
-
if soa.storage_target
|
|
375
|
-
env.Env.get().require_package('boto3')
|
|
376
|
-
from pixeltable.utils.s3_store import S3Store
|
|
377
|
-
|
|
378
|
-
return S3Store(soa)
|
|
379
|
-
if soa.storage_target == StorageTarget.R2_STORE:
|
|
380
|
-
env.Env.get().require_package('boto3')
|
|
381
|
-
from pixeltable.utils.s3_store import S3Store
|
|
382
|
-
|
|
383
|
-
return S3Store(soa)
|
|
384
|
-
if soa.storage_target == StorageTarget.B2_STORE:
|
|
369
|
+
if soa.storage_target in (StorageTarget.S3_STORE, StorageTarget.R2_STORE, StorageTarget.B2_STORE):
|
|
385
370
|
env.Env.get().require_package('boto3')
|
|
386
371
|
from pixeltable.utils.s3_store import S3Store
|
|
387
372
|
|
|
@@ -391,6 +376,11 @@ class ObjectOps:
|
|
|
391
376
|
from pixeltable.utils.gcs_store import GCSStore
|
|
392
377
|
|
|
393
378
|
return GCSStore(soa)
|
|
379
|
+
if soa.storage_target == StorageTarget.AZURE_STORE:
|
|
380
|
+
env.Env.get().require_package('azure.storage.blob')
|
|
381
|
+
from pixeltable.utils.azure_store import AzureBlobStore
|
|
382
|
+
|
|
383
|
+
return AzureBlobStore(soa)
|
|
394
384
|
if soa.storage_target == StorageTarget.HTTP_STORE and soa.is_http_readable:
|
|
395
385
|
return HTTPStore(soa)
|
|
396
386
|
error_col_name = f'Column {col_name!r}: ' if col_name is not None else ''
|
|
@@ -399,7 +389,7 @@ class ObjectOps:
|
|
|
399
389
|
)
|
|
400
390
|
|
|
401
391
|
@classmethod
|
|
402
|
-
def validate_destination(cls, dest: str | Path | None, col_name:
|
|
392
|
+
def validate_destination(cls, dest: str | Path | None, col_name: str | None = None) -> str:
|
|
403
393
|
"""Convert a Column destination parameter to a URI, else raise errors.
|
|
404
394
|
Args:
|
|
405
395
|
dest: The requested destination
|
|
@@ -407,19 +397,19 @@ class ObjectOps:
|
|
|
407
397
|
Returns:
|
|
408
398
|
URI of destination, or raises an error
|
|
409
399
|
"""
|
|
410
|
-
|
|
400
|
+
error_col_str = f'column {col_name!r}' if col_name is not None else ''
|
|
411
401
|
|
|
412
402
|
# General checks on any destination
|
|
413
403
|
if isinstance(dest, Path):
|
|
414
404
|
dest = str(dest)
|
|
415
405
|
if dest is not None and not isinstance(dest, str):
|
|
416
|
-
raise excs.Error(f'{
|
|
406
|
+
raise excs.Error(f'{error_col_str}: `destination` must be a string or path; got {dest!r}')
|
|
417
407
|
|
|
418
408
|
# Specific checks for storage backends
|
|
419
409
|
store = cls.get_store(dest, False, col_name)
|
|
420
|
-
dest2 = store.validate(
|
|
410
|
+
dest2 = store.validate(error_col_str)
|
|
421
411
|
if dest2 is None:
|
|
422
|
-
raise excs.Error(f'{
|
|
412
|
+
raise excs.Error(f'{error_col_str}: `destination` must be a supported destination; got {dest!r}')
|
|
423
413
|
return dest2
|
|
424
414
|
|
|
425
415
|
@classmethod
|
|
@@ -427,7 +417,7 @@ class ObjectOps:
|
|
|
427
417
|
"""Copy an object from a URL to a local Path. Thread safe.
|
|
428
418
|
Raises an exception if the download fails or the scheme is not supported
|
|
429
419
|
"""
|
|
430
|
-
soa = ObjectPath.parse_object_storage_addr(src_uri,
|
|
420
|
+
soa = ObjectPath.parse_object_storage_addr(src_uri, allow_obj_name=True)
|
|
431
421
|
store = cls.get_store(src_uri, True)
|
|
432
422
|
store.copy_object_to_local_file(soa.object_name, dest_path)
|
|
433
423
|
|
|
@@ -466,7 +456,7 @@ class ObjectOps:
|
|
|
466
456
|
return store.copy_local_file(col, src_path)
|
|
467
457
|
|
|
468
458
|
@classmethod
|
|
469
|
-
def delete(cls, dest:
|
|
459
|
+
def delete(cls, dest: str | None, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
|
|
470
460
|
"""Delete objects in the destination for a given table ID, table version.
|
|
471
461
|
Returns:
|
|
472
462
|
Number of objects deleted or None
|
|
@@ -475,13 +465,39 @@ class ObjectOps:
|
|
|
475
465
|
return store.delete(tbl_id, tbl_version)
|
|
476
466
|
|
|
477
467
|
@classmethod
|
|
478
|
-
def count(
|
|
479
|
-
|
|
468
|
+
def count(
|
|
469
|
+
cls,
|
|
470
|
+
tbl_id: UUID,
|
|
471
|
+
tbl_version: int | None = None,
|
|
472
|
+
dest: str | None = None,
|
|
473
|
+
default_input_dest: bool = False,
|
|
474
|
+
default_output_dest: bool = False,
|
|
475
|
+
) -> int:
|
|
476
|
+
"""
|
|
477
|
+
Return the count of objects in the destination for a given table ID.
|
|
478
|
+
|
|
479
|
+
At most one of dest, default_input, default_output may be specified. If none are specified, the fallback is the
|
|
480
|
+
local media directory.
|
|
481
|
+
|
|
482
|
+
Args:
|
|
483
|
+
tbl_id: Table ID for which to count objects
|
|
484
|
+
tbl_version: If specified, only counts objects for a specific table version
|
|
485
|
+
dest: The destination to count objects in
|
|
486
|
+
default_input_dest: If `True`, use the default input media destination
|
|
487
|
+
default_output_dest: If `True`, use the default output media destination
|
|
488
|
+
"""
|
|
489
|
+
assert sum((dest is not None, default_input_dest, default_output_dest)) <= 1, (
|
|
490
|
+
'At most one of dest, default_input, default_output may be specified'
|
|
491
|
+
)
|
|
492
|
+
if default_input_dest:
|
|
493
|
+
dest = env.Env.get().default_input_media_dest
|
|
494
|
+
if default_output_dest:
|
|
495
|
+
dest = env.Env.get().default_output_media_dest
|
|
480
496
|
store = cls.get_store(dest, False)
|
|
481
497
|
return store.count(tbl_id, tbl_version)
|
|
482
498
|
|
|
483
499
|
@classmethod
|
|
484
|
-
def list_objects(cls, dest:
|
|
500
|
+
def list_objects(cls, dest: str | None, return_uri: bool, n_max: int = 10) -> list[str]:
|
|
485
501
|
"""Return a list of objects found in the specified destination bucket.
|
|
486
502
|
The dest specification string must not contain an object name.
|
|
487
503
|
Each returned object includes the full set of prefixes.
|