pixeltable 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- pixeltable/__init__.py +1 -1
- pixeltable/catalog/catalog.py +4 -6
- pixeltable/catalog/insertable_table.py +125 -28
- pixeltable/catalog/table.py +51 -15
- pixeltable/catalog/table_version.py +12 -8
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/config.py +25 -9
- pixeltable/dataframe.py +3 -3
- pixeltable/env.py +89 -20
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +16 -4
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/data_row.py +5 -5
- pixeltable/exprs/function_call.py +59 -21
- pixeltable/exprs/row_builder.py +11 -5
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +1 -2
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/openai.py +2 -1
- pixeltable/functions/video.py +5 -5
- pixeltable/functions/whisperx.py +177 -0
- pixeltable/{ext/functions → functions}/yolox.py +0 -4
- pixeltable/globals.py +16 -3
- pixeltable/io/fiftyone.py +3 -3
- pixeltable/io/label_studio.py +2 -1
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +0 -6
- pixeltable/metadata/__init__.py +3 -1
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +6 -6
- pixeltable/share/publish.py +134 -7
- pixeltable/type_system.py +20 -4
- pixeltable/utils/media_store.py +131 -66
- pixeltable/utils/pydantic.py +60 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/METADATA +186 -121
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/RECORD +47 -46
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/publish.py
CHANGED
```diff
@@ -1,36 +1,45 @@
+import os
 import sys
 import urllib.parse
 import urllib.request
 from pathlib import Path
+from typing import Literal, Optional
 
 import requests
+from requests.adapters import HTTPAdapter
 from tqdm import tqdm
+from urllib3.util.retry import Retry
 
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
+from pixeltable.utils.media_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 
 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
 
-PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
+PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.pixeltable.com')
 
 
-def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
-
+def push_replica(
+    dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
+) -> str:
+    if not src_tbl._tbl_version_path.is_snapshot():
         raise excs.Error('Only snapshots may be published.')
 
-    packager = TablePackager(
+    packager = TablePackager(
+        src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
+    )
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
     response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()
-    if not isinstance(response_json, dict)
+    if not isinstance(response_json, dict):
         raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
@@ -42,17 +51,23 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     parsed_location = urllib.parse.urlparse(destination_uri)
     if parsed_location.scheme == 's3':
         _upload_bundle_to_s3(bundle, parsed_location)
+    elif parsed_location.scheme == 'https':
+        _upload_to_presigned_url(file_path=bundle, url=parsed_location.geturl())
     else:
         raise excs.Error(f'Unsupported destination: {destination_uri}')
 
     Env.get().console_logger.info('Finalizing snapshot ...')
 
     finalize_request_json = {
+        'table_uri': dest_tbl_uri,
         'operation_type': 'finalize_snapshot',
         'upload_id': upload_id,
         'datafile': bundle.name,
         'size': bundle.stat().st_size,
         'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+        'rows': packager.md['row_count'],  # TODO rename rows to row_count once cloud side changes are complete
+        'preview_header': packager.md['preview_header'],
+        'preview_data': packager.md['preview_data'],
     }
     # TODO: Use Pydantic for validation
     finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
@@ -107,11 +122,14 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
         raise excs.Error(f'Error cloning shapshot: unexpected response from server.\n{response_json}')
 
     primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
-    bundle_uri =
+    bundle_uri = response_json['destination_uri']
     bundle_filename = primary_tbl_additional_md['datafile']
     parsed_location = urllib.parse.urlparse(bundle_uri)
     if parsed_location.scheme == 's3':
         bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+    elif parsed_location.scheme == 'https':
+        bundle_path = TempStore.create_path()
+        _download_from_presigned_url(url=parsed_location.geturl(), output_path=bundle_path)
     else:
         raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')
 
@@ -136,7 +154,7 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
 
-    bundle_path =
+    bundle_path = TempStore.create_path()
     progress_bar = tqdm(
         desc='Downloading',
         total=bundle_size,
@@ -149,3 +167,112 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     )
     s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
     return bundle_path
+
+
+def _create_retry_session(
+    max_retries: int = 3, backoff_factor: float = 1.0, status_forcelist: Optional[list] = None
+) -> requests.Session:
+    """Create a requests session with retry configuration"""
+    if status_forcelist is None:
+        status_forcelist = [
+            408,  # Request Timeout
+            429,  # Too Many Requests (rate limiting)
+            500,  # Internal Server Error (server-side error)
+            502,  # Bad Gateway (proxy/gateway got invalid response)
+            503,  # Service Unavailable (server overloaded or down)
+            504,  # Gateway Timeout (proxy/gateway timeout)
+        ]
+    retry_strategy = Retry(
+        total=max_retries,
+        read=max_retries,
+        connect=max_retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+        allowed_methods=['GET', 'PUT', 'POST', 'DELETE'],
+    )
+
+    session = requests.Session()
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount('https://', adapter)
+    return session
+
+
+def _upload_to_presigned_url(file_path: Path, url: str, max_retries: int = 3) -> requests.Response:
+    """Upload file with progress bar and retries"""
+    file_size = file_path.stat().st_size
+
+    headers = {'Content-Length': str(file_size), 'Content-Type': 'application/octet-stream'}
+
+    session = _create_retry_session(max_retries=max_retries)
+    try:
+        with (
+            open(file_path, 'rb') as f,
+            tqdm.wrapattr(
+                f,
+                method='read',
+                total=file_size,
+                desc='Uploading',
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+                miniters=1,  # Update every iteration (should be fine for an upload)
+                ncols=100,
+                file=sys.stdout,
+            ) as file_with_progress,
+        ):
+            response = session.put(
+                url,
+                data=file_with_progress,
+                headers=headers,
+                timeout=(60, 1800),  # 60 seconds to connect and 300 seconds for server response
+            )
+            response.raise_for_status()
+            return response
+    finally:
+        session.close()
+
+
+def _download_from_presigned_url(
+    url: str, output_path: Path, headers: Optional[dict[str, str]] = None, max_retries: int = 3
+) -> None:
+    """Download file with progress bar and retries"""
+    session = _create_retry_session(max_retries=max_retries)
+
+    try:
+        # Stream download with progress
+        response = session.get(
+            url, headers=headers, stream=True, timeout=(60, 300)
+        )  # 60 seconds to connect and 300 seconds for server response
+        response.raise_for_status()
+
+        total_size = int(response.headers.get('content-length', 0))
+        progress_bar = tqdm(
+            desc='Downloading',
+            total=total_size,
+            unit='B',
+            unit_scale=True,
+            unit_divisor=1024,
+            miniters=1,
+            ncols=100,
+            file=sys.stdout,
+        )
+        with open(output_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+                    progress_bar.update(len(chunk))
+    finally:
+        session.close()
+
+
+# TODO: This will be replaced by drop_table with cloud table uri
+def delete_replica(dest_path: str) -> None:
+    """Delete cloud replica"""
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
+    response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error deleting replica: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
```
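For orientation, here is a minimal usage sketch of the reworked publishing API, based only on the signatures in the diff above. The table path and `pxt://` URIs are hypothetical placeholders, and since `PIXELTABLE_API_URL` is read once at import time, any override must be set beforehand.

```python
import os

# Hypothetical endpoint override: must be set before pixeltable.share.publish
# is imported, because PIXELTABLE_API_URL is read once at module load.
os.environ['PIXELTABLE_API_URL'] = 'https://internal-api.pixeltable.com'

import pixeltable as pxt
from pixeltable.share import publish

snapshot = pxt.get_table('my_dir.my_snapshot')  # placeholder; must be a snapshot

# bucket and access are the new keyword parameters (defaults shown).
publish.push_replica('pxt://my-org/my_snapshot', snapshot, bucket=None, access='private')

# delete_replica is new in this diff (slated to be folded into drop_table).
publish.delete_replica('pxt://my-org/my_snapshot')
```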
pixeltable/type_system.py
CHANGED
```diff
@@ -9,8 +9,11 @@ import types
 import typing
 import urllib.parse
 import urllib.request
+from pathlib import Path
 from typing import Any, ClassVar, Iterable, Literal, Mapping, Optional, Sequence, Union
 
+from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
+
 import av
 import jsonschema
 import jsonschema.protocols
@@ -24,8 +27,6 @@ from typing_extensions import _AnnotatedAlias
 import pixeltable.exceptions as excs
 from pixeltable.utils import parse_local_file_path
 
-from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
-
 
 class ColumnType:
     @enum.unique
@@ -292,7 +293,11 @@ class ColumnType:
 
     @classmethod
     def from_python_type(
-        cls,
+        cls,
+        t: type | _GenericAlias,
+        nullable_default: bool = False,
+        allow_builtin_types: bool = True,
+        infer_pydantic_json: bool = False,
     ) -> Optional[ColumnType]:
         """
         Convert a Python type into a Pixeltable `ColumnType` instance.
@@ -305,6 +310,8 @@ class ColumnType:
                 allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
                 `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
                 designations will be allowed regardless.
+            infer_pydantic_json: If True, accepts an extended set of built-ins (eg, Enum, Path) and returns the type to
+                which pydantic.BaseModel.model_dump(mode='json') serializes it.
         """
         origin = typing.get_origin(t)
         type_args = typing.get_args(t)
@@ -314,7 +321,9 @@ class ColumnType:
             # `t` is a type of the form Optional[T] (equivalently, T | None or None | T).
             # We treat it as the underlying type but with nullable=True.
             underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
-            underlying = cls.from_python_type(
+            underlying = cls.from_python_type(
+                underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
+            )
             if underlying is not None:
                 return underlying.copy(nullable=True)
         elif origin is Required:
@@ -341,6 +350,13 @@ class ColumnType:
             if literal_type is None:
                 return None
             return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+        if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
+            literal_type = cls.infer_common_literal_type(member.value for member in t)
+            if literal_type is None:
+                return None
+            return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+        if infer_pydantic_json and t is Path:
+            return StringType(nullable=nullable_default)
         if t is str:
             return StringType(nullable=nullable_default)
         if t is int:
```
pixeltable/utils/media_store.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import glob
+import logging
 import os
 import re
 import shutil
@@ -19,10 +20,12 @@ from pixeltable import env
 if TYPE_CHECKING:
     from pixeltable.catalog import Column
 
+_logger = logging.getLogger('pixeltable')
+
 
 class MediaStore:
     """
-    Utilities to manage media files stored in
+    Utilities to manage media files stored in a local filesystem directory.
 
     Media file names are a composite of: table id, column id, tbl_version, new uuid:
     the table id/column id/tbl_version are redundant but useful for identifying all files for a table
@@ -30,38 +33,70 @@ class MediaStore:
     """
 
     pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid
+    __base_dir: Path
+
+    def __init__(self, base_dir: Path):
+        """Initialize a MediaStore with a base directory."""
+        assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
+        self.__base_dir = base_dir
 
     @classmethod
-    def
-    """
-
+    def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
+        """Get a MediaStore instance for the given base URI, or the environment's media_dir if None."""
+        if base_uri is None:
+            return MediaStore(env.Env.get().media_dir)
+        raise NotImplementedError
 
     @classmethod
-    def
-    """
-
+    def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
+        """Save a media binary data to a file in a MediaStore. format is ignored for binary data."""
+        assert isinstance(file_data, bytes)
+        with open(dest_path, 'wb') as f:
+            f.write(file_data)
+            f.flush()  # Ensures Python buffers are written to OS
+            os.fsync(f.fileno())  # Forces OS to write to physical storage
+        return dest_path
 
     @classmethod
-    def
+    def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
+        """Save a PIL Image to a file in a MediaStore with the specified format."""
+        if dest_path.suffix != f'.{format}':
+            dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
+
+        with open(dest_path, 'wb') as f:
+            image.save(f, format=format)
+            f.flush()  # Ensures Python buffers are written to OS
+            os.fsync(f.fileno())  # Forces OS to write to physical storage
+        return dest_path
+
+    def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
         """
         Construct a new, unique Path name for a persisted media file, and create the parent directory
         for the new Path if it does not already exist. The Path will reside in
         the environment's media_dir.
         """
         id_hex = uuid.uuid4().hex
-        parent =
+        parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
         parent.mkdir(parents=True, exist_ok=True)
-        return parent / f'{
+        return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
 
-
-
-
+    def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
+        """
+        Construct a new, unique Path name for a persisted media file, and create the parent directory
+        for the new Path if it does not already exist. The Path will reside in
+        the environment's media_dir.
+        """
+        assert col.tbl is not None, 'Column must be associated with a table'
+        return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)
+
+    def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
+        """Return path if the given url refers to a file managed by this MediaStore, else None.
 
         Args:
-            file_url: URL
+            file_url: URL to check
 
         Returns:
-            If the
+            If the url is a managed file, return a Path() to the file, None, otherwise
         """
         if file_url is None:
             return None
@@ -74,93 +109,76 @@ class MediaStore:
             # remote url
             return None
         src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-
-        if not src_path.startswith(pxt_tmp_dir):
+        if not src_path.startswith(str(self.__base_dir)):
             # not a tmp file
             return None
         return Path(src_path)
 
-
-
-
-        dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
+    def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
+        """Relocate a local file to a MediaStore, and return its new URL"""
+        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
         src_path.rename(dest_path)
-
-
-
-
-
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
+        return new_file_url
+
+    def copy_local_media_file(self, src_path: Path, col: Column) -> str:
+        """Copy a local file to a MediaStore, and return its new URL"""
+        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
+        shutil.copy2(src_path, dest_path)
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
+        return new_file_url
+
+    def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+        """Save a media data object to a file in a MediaStore
         Returns:
             dest_path: Path to the saved media file
             url: URL of the saved media file
         """
         assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
-        dest_path =
+        dest_path = self._prepare_media_path(col)
         if isinstance(data, bytes):
-            dest_path =
+            dest_path = self._save_binary_media_file(data, dest_path, format)
         elif isinstance(data, PIL.Image.Image):
-            dest_path =
+            dest_path = self._save_pil_image_file(data, dest_path, format)
         else:
             raise ValueError(f'Unsupported media object type: {type(data)}')
-
-        return dest_path,
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        return dest_path, new_file_url
 
-
-    def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
-        """Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
-        assert isinstance(file_data, bytes)
-        with open(dest_path, 'wb') as f:
-            f.write(file_data)
-            f.flush()  # Ensures Python buffers are written to OS
-            os.fsync(f.fileno())  # Forces OS to write to physical storage
-        return dest_path
-
-    @classmethod
-    def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
-        """Save a PIL Image to a file in the MediaStore with the specified format."""
-        if dest_path.suffix != f'.{format}':
-            dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
-
-        with open(dest_path, 'wb') as f:
-            image.save(f, format=format)
-            f.flush()  # Ensures Python buffers are written to OS
-            os.fsync(f.fileno())  # Forces OS to write to physical storage
-        return dest_path
-
-    @classmethod
-    def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
+    def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
         only those files belonging to the specified tbl_version."""
         assert tbl_id is not None
         if tbl_version is None:
             # Remove the entire folder for this table id.
-            path =
+            path = self.__base_dir / tbl_id.hex
             if path.exists():
                 shutil.rmtree(path)
         else:
             # Remove only the elements for the specified tbl_version.
-            paths = glob.glob(
-                str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
-            )
+            paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
             for p in paths:
                 os.remove(p)
 
-
-    def count(cls, tbl_id: UUID) -> int:
+    def count(self, tbl_id: Optional[UUID]) -> int:
        """
        Return number of files for given tbl_id.
        """
-
+        if tbl_id is None:
+            paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
+        else:
+            paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
        return len(paths)
 
-
-
-        paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
+    def stats(self) -> list[tuple[UUID, int, int, int]]:
+        paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
         # key: (tbl_id, col_id), value: (num_files, size)
         d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
         for p in paths:
             if not os.path.isdir(p):
-                matched = re.match(
+                matched = re.match(self.pattern, Path(p).name)
                 assert matched is not None
                 tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
                 file_info = os.stat(p)
@@ -170,3 +188,50 @@ class MediaStore:
         result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
         result.sort(key=lambda e: e[3], reverse=True)
         return result
+
+
+class TempStore:
+    """
+    A temporary store for files of data that are not yet persisted to their destination(s).
+    A destination is typically either a MediaStore (local persisted files) or a cloud object store.
+
+    The TempStore class has no internal state. It provides functionality to manage temporary files
+    in the env.Env.get().tmp_dir directory.
+    It reuses some of the MediaStore functionality to create unique file names and save objects.
+    """
+
+    @classmethod
+    def _tmp_dir(cls) -> Path:
+        """Returns the path to the temporary directory where files are stored."""
+        from pixeltable import env
+
+        return env.Env.get().tmp_dir
+
+    @classmethod
+    def count(cls, tbl_id: Optional[UUID] = None) -> int:
+        return MediaStore(cls._tmp_dir()).count(tbl_id)
+
+    @classmethod
+    def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
+        return MediaStore(cls._tmp_dir()).resolve_url(file_url)
+
+    @classmethod
+    def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+        return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)
+
+    @classmethod
+    def delete_media_file(cls, obj_path: Path) -> None:
+        """Delete a media object from the temporary store."""
+        assert obj_path is not None, 'Object path must be provided'
+        assert obj_path.exists(), f'Object path does not exist: {obj_path}'
+        assert cls.resolve_url(str(obj_path)) is not None, f'Object path is not a valid media store path: {obj_path}'
+        obj_path.unlink()
+
+    @classmethod
+    def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
+        """Return a new, unique Path located in the temporary store.
+        If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
+        If tbl_id is None, a random UUID will be used to create the path."""
+        if tbl_id is not None:
+            return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
+        return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
```
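To make the refactor concrete, here is a short usage sketch assuming an initialized Pixeltable environment (e.g., after `pxt.init()`); the file extensions are arbitrary. `MediaStore` is now an instantiable class bound to a base directory, and `TempStore` delegates to a `MediaStore` rooted at the environment's `tmp_dir`.

```python
import uuid

from pixeltable.utils.media_store import MediaStore, TempStore

# MediaStore.get() returns the store rooted at the environment's media_dir;
# per the diff, passing an explicit base_uri is not implemented yet.
store = MediaStore.get()
print(store.count(None))  # with tbl_id=None, counts top-level entries of the store

# TempStore paths: a random-UUID name, or a MediaStore-style name for a table id.
scratch = TempStore.create_path(extension='.mp4')
scoped = TempStore.create_path(tbl_id=uuid.uuid4(), extension='.jpg')
print(scratch, scoped)
```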
pixeltable/utils/pydantic.py
ADDED

```diff
@@ -0,0 +1,60 @@
+import typing
+from datetime import datetime
+from enum import Enum
+from types import UnionType
+from typing import Any, Union
+
+import pydantic
+
+
+def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
+    """
+    Determine if instances of a Pydantic model can be converted to valid JSON
+    based on the type hints of its fields.
+    """
+    type_hints = typing.get_type_hints(model)
+    return all(_type_is_json_convertible(field_type) for field_type in type_hints.values())
+
+
+def _type_is_json_convertible(type_hint: Any) -> bool:
+    """
+    Recursively check if a type hint represents a JSON-compatible type.
+
+    TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
+    """
+    if type_hint is type(None):
+        return True
+    if type_hint is Any:
+        return False
+
+    if type_hint in (str, int, float, bool, datetime):
+        return True
+
+    if isinstance(type_hint, type) and issubclass(type_hint, Enum):
+        return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
+
+    if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
+        return is_json_convertible(type_hint)
+
+    origin = typing.get_origin(type_hint)
+    args = typing.get_args(type_hint)
+
+    if origin in (Union, UnionType):
+        return all(_type_is_json_convertible(arg) for arg in args)
+
+    if origin in (list, tuple):
+        return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
+
+    if origin is dict:
+        if len(args) != 2:
+            # we can't tell what this is
+            return False
+        key_type, value_type = args
+        # keys must be strings, values must be json-convertible
+        return key_type is str and _type_is_json_convertible(value_type)
+
+    # Literal types are json-convertible if their values are
+    if origin is typing.Literal:
+        return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
+
+    return False
```