pixeltable 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/catalog/catalog.py +4 -6
- pixeltable/catalog/table.py +41 -14
- pixeltable/catalog/table_version.py +12 -8
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/config.py +24 -9
- pixeltable/dataframe.py +3 -3
- pixeltable/env.py +70 -16
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +16 -4
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/data_row.py +5 -5
- pixeltable/exprs/function_call.py +59 -21
- pixeltable/exprs/row_builder.py +11 -5
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/functions/anthropic.py +1 -2
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/openai.py +2 -1
- pixeltable/functions/video.py +5 -5
- pixeltable/globals.py +13 -2
- pixeltable/io/fiftyone.py +3 -3
- pixeltable/io/label_studio.py +2 -1
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +0 -6
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +6 -6
- pixeltable/share/publish.py +134 -7
- pixeltable/utils/media_store.py +131 -66
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/METADATA +186 -121
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/RECORD +37 -37
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/publish.py
CHANGED
@@ -1,36 +1,45 @@
+import os
 import sys
 import urllib.parse
 import urllib.request
 from pathlib import Path
+from typing import Literal, Optional
 
 import requests
+from requests.adapters import HTTPAdapter
 from tqdm import tqdm
+from urllib3.util.retry import Retry
 
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
+from pixeltable.utils.media_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 
 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
 
-PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
+PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.pixeltable.com')
 
 
-def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
-
+def push_replica(
+    dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
+) -> str:
+    if not src_tbl._tbl_version_path.is_snapshot():
         raise excs.Error('Only snapshots may be published.')
 
-    packager = TablePackager(
+    packager = TablePackager(
+        src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
+    )
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
     response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()
-    if not isinstance(response_json, dict)
+    if not isinstance(response_json, dict):
         raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
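Usage note: the endpoint is now overridable through the PIXELTABLE_API_URL environment variable (read once at import time), and push_replica gains bucket and access parameters. A minimal sketch; the endpoint, snapshot name, destination URI, and bucket below are hypothetical:

import os

# Must be set before pixeltable.share.publish is imported: the module reads
# PIXELTABLE_API_URL once at import time.
os.environ['PIXELTABLE_API_URL'] = 'https://staging-api.example.com'  # hypothetical endpoint

import pixeltable as pxt
from pixeltable.share import publish

snap = pxt.get_table('demo.films_snapshot')  # an existing snapshot; non-snapshots are rejected
dest_uri = publish.push_replica(
    'pxt://my-org/films', snap, bucket='my-media-bucket', access='public'  # hypothetical URI/bucket
)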
@@ -42,17 +51,23 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     parsed_location = urllib.parse.urlparse(destination_uri)
     if parsed_location.scheme == 's3':
         _upload_bundle_to_s3(bundle, parsed_location)
+    elif parsed_location.scheme == 'https':
+        _upload_to_presigned_url(file_path=bundle, url=parsed_location.geturl())
     else:
         raise excs.Error(f'Unsupported destination: {destination_uri}')
 
     Env.get().console_logger.info('Finalizing snapshot ...')
 
     finalize_request_json = {
+        'table_uri': dest_tbl_uri,
         'operation_type': 'finalize_snapshot',
         'upload_id': upload_id,
         'datafile': bundle.name,
         'size': bundle.stat().st_size,
         'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+        'rows': packager.md['row_count'],  # TODO rename rows to row_count once cloud side changes are complete
+        'preview_header': packager.md['preview_header'],
+        'preview_data': packager.md['preview_data'],
     }
     # TODO: Use Pydantic for validation
     finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
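For reference, the finalize request now carries row-count and preview metadata alongside the integrity fields. An illustrative body, with every value hypothetical and the preview shapes assumed (column names plus sample rows):

finalize_request_json = {
    'table_uri': 'pxt://my-org/films',  # hypothetical
    'operation_type': 'finalize_snapshot',
    'upload_id': '1b9e76a0',            # hypothetical, echoed from the publish response
    'datafile': 'tbl_bundle.tar.bz2',   # hypothetical bundle name
    'size': 104857600,                  # bundle size in bytes
    'sha256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',  # client-side digest
    'rows': 12500,
    'preview_header': ['id', 'title'],               # assumed shape: column names
    'preview_data': [[1, 'Alien'], [2, 'Solaris']],  # assumed shape: sample rows
}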
@@ -107,11 +122,14 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
         raise excs.Error(f'Error cloning shapshot: unexpected response from server.\n{response_json}')
 
     primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
-    bundle_uri =
+    bundle_uri = response_json['destination_uri']
     bundle_filename = primary_tbl_additional_md['datafile']
     parsed_location = urllib.parse.urlparse(bundle_uri)
     if parsed_location.scheme == 's3':
         bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+    elif parsed_location.scheme == 'https':
+        bundle_path = TempStore.create_path()
+        _download_from_presigned_url(url=parsed_location.geturl(), output_path=bundle_path)
     else:
         raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')
 
@@ -136,7 +154,7 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
 
-    bundle_path =
+    bundle_path = TempStore.create_path()
     progress_bar = tqdm(
         desc='Downloading',
         total=bundle_size,
@@ -149,3 +167,112 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
     )
     s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
     return bundle_path
+
+
+def _create_retry_session(
+    max_retries: int = 3, backoff_factor: float = 1.0, status_forcelist: Optional[list] = None
+) -> requests.Session:
+    """Create a requests session with retry configuration"""
+    if status_forcelist is None:
+        status_forcelist = [
+            408,  # Request Timeout
+            429,  # Too Many Requests (rate limiting)
+            500,  # Internal Server Error (server-side error)
+            502,  # Bad Gateway (proxy/gateway got invalid response)
+            503,  # Service Unavailable (server overloaded or down)
+            504,  # Gateway Timeout (proxy/gateway timeout)
+        ]
+    retry_strategy = Retry(
+        total=max_retries,
+        read=max_retries,
+        connect=max_retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+        allowed_methods=['GET', 'PUT', 'POST', 'DELETE'],
+    )
+
+    session = requests.Session()
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount('https://', adapter)
+    return session
+
+
+def _upload_to_presigned_url(file_path: Path, url: str, max_retries: int = 3) -> requests.Response:
+    """Upload file with progress bar and retries"""
+    file_size = file_path.stat().st_size
+
+    headers = {'Content-Length': str(file_size), 'Content-Type': 'application/octet-stream'}
+
+    session = _create_retry_session(max_retries=max_retries)
+    try:
+        with (
+            open(file_path, 'rb') as f,
+            tqdm.wrapattr(
+                f,
+                method='read',
+                total=file_size,
+                desc='Uploading',
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+                miniters=1,  # Update every iteration (should be fine for an upload)
+                ncols=100,
+                file=sys.stdout,
+            ) as file_with_progress,
+        ):
+            response = session.put(
+                url,
+                data=file_with_progress,
+                headers=headers,
+                timeout=(60, 1800),  # 60 seconds to connect and 1800 seconds for server response
+            )
+            response.raise_for_status()
+            return response
+    finally:
+        session.close()
+
+
+def _download_from_presigned_url(
+    url: str, output_path: Path, headers: Optional[dict[str, str]] = None, max_retries: int = 3
+) -> None:
+    """Download file with progress bar and retries"""
+    session = _create_retry_session(max_retries=max_retries)
+
+    try:
+        # Stream download with progress
+        response = session.get(
+            url, headers=headers, stream=True, timeout=(60, 300)
+        )  # 60 seconds to connect and 300 seconds for server response
+        response.raise_for_status()
+
+        total_size = int(response.headers.get('content-length', 0))
+        progress_bar = tqdm(
+            desc='Downloading',
+            total=total_size,
+            unit='B',
+            unit_scale=True,
+            unit_divisor=1024,
+            miniters=1,
+            ncols=100,
+            file=sys.stdout,
+        )
+        with open(output_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+                    progress_bar.update(len(chunk))
+    finally:
+        session.close()
+
+
+# TODO: This will be replaced by drop_table with cloud table uri
+def delete_replica(dest_path: str) -> None:
+    """Delete cloud replica"""
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
+    response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error deleting replica: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
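Taken together, these helpers give every transfer a bounded retry budget: urllib3's Retry re-attempts connection/read failures and the listed status codes on GET/PUT/POST/DELETE, sleeping exponentially longer between attempts (scaled by backoff_factor; the exact schedule depends on the urllib3 version). A sketch against a hypothetical presigned URL:

from pathlib import Path

presigned = 'https://bucket.s3.amazonaws.com/bundle.tar.bz2?X-Amz-Signature=abc123'  # hypothetical
_download_from_presigned_url(url=presigned, output_path=Path('/tmp/bundle.tar.bz2'))

# The session factory also works standalone; exhausting the retry budget
# surfaces as a requests exception (e.g. RetryError or ConnectionError).
session = _create_retry_session(max_retries=5, backoff_factor=2.0)
try:
    resp = session.get(presigned, timeout=(60, 300))
    resp.raise_for_status()
finally:
    session.close()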
pixeltable/utils/media_store.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import glob
+import logging
 import os
 import re
 import shutil
@@ -19,10 +20,12 @@ from pixeltable import env
 if TYPE_CHECKING:
     from pixeltable.catalog import Column
 
+_logger = logging.getLogger('pixeltable')
+
 
 class MediaStore:
     """
-    Utilities to manage media files stored in
+    Utilities to manage media files stored in a local filesystem directory.
 
     Media file names are a composite of: table id, column id, tbl_version, new uuid:
     the table id/column id/tbl_version are redundant but useful for identifying all files for a table
@@ -30,38 +33,70 @@ class MediaStore:
     """
 
     pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid
+    __base_dir: Path
+
+    def __init__(self, base_dir: Path):
+        """Initialize a MediaStore with a base directory."""
+        assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
+        self.__base_dir = base_dir
 
     @classmethod
-    def
-    """
-
+    def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
+        """Get a MediaStore instance for the given base URI, or the environment's media_dir if None."""
+        if base_uri is None:
+            return MediaStore(env.Env.get().media_dir)
+        raise NotImplementedError
 
     @classmethod
-    def
-    """
-
+    def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
+        """Save a media binary data to a file in a MediaStore. format is ignored for binary data."""
+        assert isinstance(file_data, bytes)
+        with open(dest_path, 'wb') as f:
+            f.write(file_data)
+            f.flush()  # Ensures Python buffers are written to OS
+            os.fsync(f.fileno())  # Forces OS to write to physical storage
+        return dest_path
 
     @classmethod
-    def
+    def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
+        """Save a PIL Image to a file in a MediaStore with the specified format."""
+        if dest_path.suffix != f'.{format}':
+            dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
+
+        with open(dest_path, 'wb') as f:
+            image.save(f, format=format)
+            f.flush()  # Ensures Python buffers are written to OS
+            os.fsync(f.fileno())  # Forces OS to write to physical storage
+        return dest_path
+
+    def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
         """
         Construct a new, unique Path name for a persisted media file, and create the parent directory
         for the new Path if it does not already exist. The Path will reside in
         the environment's media_dir.
         """
         id_hex = uuid.uuid4().hex
-        parent =
+        parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
         parent.mkdir(parents=True, exist_ok=True)
-        return parent / f'{
+        return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
 
-
-
-
+    def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
+        """
+        Construct a new, unique Path name for a persisted media file, and create the parent directory
+        for the new Path if it does not already exist. The Path will reside in
+        the environment's media_dir.
+        """
+        assert col.tbl is not None, 'Column must be associated with a table'
+        return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)
+
+    def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
+        """Return path if the given url refers to a file managed by this MediaStore, else None.
 
         Args:
-            file_url: URL
+            file_url: URL to check
 
         Returns:
-            If the
+            If the url is a managed file, return a Path() to the file, None, otherwise
         """
         if file_url is None:
             return None
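A sketch of the layout this produces: _prepare_media_path_raw fans files out under two uuid-prefix shard directories inside a per-table directory. The helper is internal and the base directory below is hypothetical:

from pathlib import Path
from uuid import uuid4

store = MediaStore(Path('/tmp/pxt_media'))  # hypothetical base directory
p = store._prepare_media_path_raw(uuid4(), col_id=3, tbl_version=7, ext='.jpg')
# p looks like:
#   /tmp/pxt_media/<tbl_id.hex>/<hh>/<hhhh>/<tbl_id.hex>_3_7_<file_uuid.hex>.jpg
# where <hh>/<hhhh> are the first 2 and 4 hex digits of the file uuid; the file
# name itself matches MediaStore.pattern, so it can be rediscovered via glob.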
@@ -74,93 +109,76 @@ class MediaStore:
             # remote url
             return None
         src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-
-        if not src_path.startswith(pxt_tmp_dir):
+        if not src_path.startswith(str(self.__base_dir)):
             # not a tmp file
             return None
         return Path(src_path)
 
-
-
-
-        dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
+    def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
+        """Relocate a local file to a MediaStore, and return its new URL"""
+        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
         src_path.rename(dest_path)
-
-
-
-
-
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
+        return new_file_url
+
+    def copy_local_media_file(self, src_path: Path, col: Column) -> str:
+        """Copy a local file to a MediaStore, and return its new URL"""
+        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
+        shutil.copy2(src_path, dest_path)
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
+        return new_file_url
+
+    def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+        """Save a media data object to a file in a MediaStore
         Returns:
             dest_path: Path to the saved media file
             url: URL of the saved media file
         """
         assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
-        dest_path =
+        dest_path = self._prepare_media_path(col)
         if isinstance(data, bytes):
-            dest_path =
+            dest_path = self._save_binary_media_file(data, dest_path, format)
         elif isinstance(data, PIL.Image.Image):
-            dest_path =
+            dest_path = self._save_pil_image_file(data, dest_path, format)
         else:
             raise ValueError(f'Unsupported media object type: {type(data)}')
-
-        return dest_path,
+        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+        return dest_path, new_file_url
 
-
-    def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
-        """Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
-        assert isinstance(file_data, bytes)
-        with open(dest_path, 'wb') as f:
-            f.write(file_data)
-            f.flush()  # Ensures Python buffers are written to OS
-            os.fsync(f.fileno())  # Forces OS to write to physical storage
-        return dest_path
-
-    @classmethod
-    def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
-        """Save a PIL Image to a file in the MediaStore with the specified format."""
-        if dest_path.suffix != f'.{format}':
-            dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
-
-        with open(dest_path, 'wb') as f:
-            image.save(f, format=format)
-            f.flush()  # Ensures Python buffers are written to OS
-            os.fsync(f.fileno())  # Forces OS to write to physical storage
-        return dest_path
-
-    @classmethod
-    def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
+    def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
         only those files belonging to the specified tbl_version."""
         assert tbl_id is not None
         if tbl_version is None:
             # Remove the entire folder for this table id.
-            path =
+            path = self.__base_dir / tbl_id.hex
             if path.exists():
                 shutil.rmtree(path)
         else:
             # Remove only the elements for the specified tbl_version.
-            paths = glob.glob(
-                str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
-            )
+            paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
         for p in paths:
             os.remove(p)
 
-
-    def count(cls, tbl_id: UUID) -> int:
+    def count(self, tbl_id: Optional[UUID]) -> int:
         """
         Return number of files for given tbl_id.
         """
-
+        if tbl_id is None:
+            paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
+        else:
+            paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
         return len(paths)
 
-
-
-        paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
+    def stats(self) -> list[tuple[UUID, int, int, int]]:
+        paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
         # key: (tbl_id, col_id), value: (num_files, size)
         d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
         for p in paths:
             if not os.path.isdir(p):
-                matched = re.match(
+                matched = re.match(self.pattern, Path(p).name)
                 assert matched is not None
                 tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
                 file_info = os.stat(p)
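Since delete, count, and stats now operate on an instance bound to a base directory rather than on the class, callers go through a MediaStore object. A minimal sketch, assuming an initialized pixeltable environment and a hypothetical table id:

from uuid import UUID

store = MediaStore.get()  # rooted at env.Env.get().media_dir
tbl_id = UUID('0123456789abcdef0123456789abcdef')  # hypothetical table id
print(store.count(tbl_id))           # files stored for that table
print(store.count(None))             # entries across the whole store
store.delete(tbl_id, tbl_version=7)  # drop only files written for table version 7
store.delete(tbl_id)                 # drop the table's entire media directory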
@@ -170,3 +188,50 @@
         result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
         result.sort(key=lambda e: e[3], reverse=True)
         return result
+
+
+class TempStore:
+    """
+    A temporary store for files of data that are not yet persisted to their destination(s).
+    A destination is typically either a MediaStore (local persisted files) or a cloud object store.
+
+    The TempStore class has no internal state. It provides functionality to manage temporary files
+    in the env.Env.get().tmp_dir directory.
+    It reuses some of the MediaStore functionality to create unique file names and save objects.
+    """
+
+    @classmethod
+    def _tmp_dir(cls) -> Path:
+        """Returns the path to the temporary directory where files are stored."""
+        from pixeltable import env
+
+        return env.Env.get().tmp_dir
+
+    @classmethod
+    def count(cls, tbl_id: Optional[UUID] = None) -> int:
+        return MediaStore(cls._tmp_dir()).count(tbl_id)
+
+    @classmethod
+    def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
+        return MediaStore(cls._tmp_dir()).resolve_url(file_url)
+
+    @classmethod
+    def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+        return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)
+
+    @classmethod
+    def delete_media_file(cls, obj_path: Path) -> None:
+        """Delete a media object from the temporary store."""
+        assert obj_path is not None, 'Object path must be provided'
+        assert obj_path.exists(), f'Object path does not exist: {obj_path}'
+        assert cls.resolve_url(str(obj_path)) is not None, f'Object path is not a valid media store path: {obj_path}'
+        obj_path.unlink()
+
+    @classmethod
+    def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
+        """Return a new, unique Path located in the temporary store.
+        If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
+        If tbl_id is None, a random UUID will be used to create the path."""
+        if tbl_id is not None:
+            return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
+        return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'