pixeltable 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.
Files changed (50)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/catalog/catalog.py +4 -6
  3. pixeltable/catalog/insertable_table.py +125 -28
  4. pixeltable/catalog/table.py +51 -15
  5. pixeltable/catalog/table_version.py +12 -8
  6. pixeltable/catalog/table_version_path.py +6 -5
  7. pixeltable/config.py +25 -9
  8. pixeltable/dataframe.py +3 -3
  9. pixeltable/env.py +89 -20
  10. pixeltable/exec/aggregation_node.py +1 -1
  11. pixeltable/exec/cache_prefetch_node.py +4 -3
  12. pixeltable/exec/exec_node.py +0 -8
  13. pixeltable/exec/expr_eval/globals.py +1 -0
  14. pixeltable/exec/expr_eval/schedulers.py +16 -4
  15. pixeltable/exec/in_memory_data_node.py +2 -3
  16. pixeltable/exprs/data_row.py +5 -5
  17. pixeltable/exprs/function_call.py +59 -21
  18. pixeltable/exprs/row_builder.py +11 -5
  19. pixeltable/func/expr_template_function.py +6 -3
  20. pixeltable/functions/__init__.py +2 -0
  21. pixeltable/functions/anthropic.py +1 -2
  22. pixeltable/functions/deepseek.py +5 -1
  23. pixeltable/functions/gemini.py +11 -2
  24. pixeltable/functions/huggingface.py +6 -12
  25. pixeltable/functions/openai.py +2 -1
  26. pixeltable/functions/video.py +5 -5
  27. pixeltable/functions/whisperx.py +177 -0
  28. pixeltable/{ext/functions → functions}/yolox.py +0 -4
  29. pixeltable/globals.py +16 -3
  30. pixeltable/io/fiftyone.py +3 -3
  31. pixeltable/io/label_studio.py +2 -1
  32. pixeltable/iterators/audio.py +3 -2
  33. pixeltable/iterators/document.py +0 -6
  34. pixeltable/metadata/__init__.py +3 -1
  35. pixeltable/mypy/__init__.py +3 -0
  36. pixeltable/mypy/mypy_plugin.py +123 -0
  37. pixeltable/plan.py +0 -16
  38. pixeltable/share/packager.py +6 -6
  39. pixeltable/share/publish.py +134 -7
  40. pixeltable/type_system.py +20 -4
  41. pixeltable/utils/media_store.py +131 -66
  42. pixeltable/utils/pydantic.py +60 -0
  43. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/METADATA +186 -121
  44. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/RECORD +47 -46
  45. pixeltable/ext/__init__.py +0 -17
  46. pixeltable/ext/functions/__init__.py +0 -11
  47. pixeltable/ext/functions/whisperx.py +0 -77
  48. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/WHEEL +0 -0
  49. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/entry_points.txt +0 -0
  50. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/publish.py CHANGED
@@ -1,36 +1,45 @@
+ import os
  import sys
  import urllib.parse
  import urllib.request
  from pathlib import Path
+ from typing import Literal, Optional

  import requests
+ from requests.adapters import HTTPAdapter
  from tqdm import tqdm
+ from urllib3.util.retry import Retry

  import pixeltable as pxt
  from pixeltable import exceptions as excs
  from pixeltable.env import Env
  from pixeltable.utils import sha256sum
+ from pixeltable.utils.media_store import TempStore

  from .packager import TablePackager, TableRestorer

  # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
  # pixeltable.com URLs are available.

- PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
+ PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.pixeltable.com')


- def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
-     if not src_tbl._tbl_version.get().is_snapshot:
+ def push_replica(
+     dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
+ ) -> str:
+     if not src_tbl._tbl_version_path.is_snapshot():
          raise excs.Error('Only snapshots may be published.')

-     packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+     packager = TablePackager(
+         src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
+     )
      request_json = packager.md | {'operation_type': 'publish_snapshot'}
      headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
      response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
      if response.status_code != 200:
          raise excs.Error(f'Error publishing snapshot: {response.text}')
      response_json = response.json()
-     if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+     if not isinstance(response_json, dict):
          raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
      upload_id = response_json['upload_id']
      destination_uri = response_json['destination_uri']
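
push_replica now takes a destination bucket and a public/private access flag, and the API endpoint can be overridden via the PIXELTABLE_API_URL environment variable. A minimal usage sketch; the table names and URI format are hypothetical, not taken from the diff:

    import pixeltable as pxt
    from pixeltable.share.publish import push_replica

    films = pxt.get_table('films')                   # hypothetical existing table
    snap = pxt.create_snapshot('films_snap', films)  # only snapshots may be published
    url = push_replica('pxt://my-org/films_snap', snap, bucket='my-bucket', access='public')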
@@ -42,17 +51,23 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
      parsed_location = urllib.parse.urlparse(destination_uri)
      if parsed_location.scheme == 's3':
          _upload_bundle_to_s3(bundle, parsed_location)
+     elif parsed_location.scheme == 'https':
+         _upload_to_presigned_url(file_path=bundle, url=parsed_location.geturl())
      else:
          raise excs.Error(f'Unsupported destination: {destination_uri}')

      Env.get().console_logger.info('Finalizing snapshot ...')

      finalize_request_json = {
+         'table_uri': dest_tbl_uri,
          'operation_type': 'finalize_snapshot',
          'upload_id': upload_id,
          'datafile': bundle.name,
          'size': bundle.stat().st_size,
          'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+         'rows': packager.md['row_count'],  # TODO: rename rows to row_count once cloud side changes are complete
+         'preview_header': packager.md['preview_header'],
+         'preview_data': packager.md['preview_data'],
      }
      # TODO: Use Pydantic for validation
      finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
@@ -107,11 +122,14 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
          raise excs.Error(f'Error cloning shapshot: unexpected response from server.\n{response_json}')

      primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
-     bundle_uri = primary_tbl_additional_md['destination_uri']
+     bundle_uri = response_json['destination_uri']
      bundle_filename = primary_tbl_additional_md['datafile']
      parsed_location = urllib.parse.urlparse(bundle_uri)
      if parsed_location.scheme == 's3':
          bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+     elif parsed_location.scheme == 'https':
+         bundle_path = TempStore.create_path()
+         _download_from_presigned_url(url=parsed_location.geturl(), output_path=bundle_path)
      else:
          raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')
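
pull_replica now reads the bundle location from the top-level response and handles presigned HTTPS downloads alongside s3. A hypothetical usage sketch (paths and URIs are illustrative only):

    import pixeltable as pxt
    from pixeltable.share.publish import pull_replica

    # Clone a published snapshot into a local table; the download goes via s3 or
    # HTTPS depending on the destination_uri the server returns.
    tbl = pull_replica('replicas.films_snap', 'pxt://my-org/films_snap')
    print(tbl.count())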
 
@@ -136,7 +154,7 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
      obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
      bundle_size = obj['ContentLength']

-     bundle_path = Path(Env.get().create_tmp_path())
+     bundle_path = TempStore.create_path()
      progress_bar = tqdm(
          desc='Downloading',
          total=bundle_size,
@@ -149,3 +167,112 @@ def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_f
      )
      s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
      return bundle_path
+
+
+ def _create_retry_session(
+     max_retries: int = 3, backoff_factor: float = 1.0, status_forcelist: Optional[list] = None
+ ) -> requests.Session:
+     """Create a requests session with retry configuration"""
+     if status_forcelist is None:
+         status_forcelist = [
+             408,  # Request Timeout
+             429,  # Too Many Requests (rate limiting)
+             500,  # Internal Server Error (server-side error)
+             502,  # Bad Gateway (proxy/gateway got invalid response)
+             503,  # Service Unavailable (server overloaded or down)
+             504,  # Gateway Timeout (proxy/gateway timeout)
+         ]
+     retry_strategy = Retry(
+         total=max_retries,
+         read=max_retries,
+         connect=max_retries,
+         backoff_factor=backoff_factor,
+         status_forcelist=status_forcelist,
+         allowed_methods=['GET', 'PUT', 'POST', 'DELETE'],
+     )
+
+     session = requests.Session()
+     adapter = HTTPAdapter(max_retries=retry_strategy)
+     session.mount('https://', adapter)
+     return session
+
+
+ def _upload_to_presigned_url(file_path: Path, url: str, max_retries: int = 3) -> requests.Response:
+     """Upload file with progress bar and retries"""
+     file_size = file_path.stat().st_size
+
+     headers = {'Content-Length': str(file_size), 'Content-Type': 'application/octet-stream'}
+
+     session = _create_retry_session(max_retries=max_retries)
+     try:
+         with (
+             open(file_path, 'rb') as f,
+             tqdm.wrapattr(
+                 f,
+                 method='read',
+                 total=file_size,
+                 desc='Uploading',
+                 unit='B',
+                 unit_scale=True,
+                 unit_divisor=1024,
+                 miniters=1,  # Update every iteration (should be fine for an upload)
+                 ncols=100,
+                 file=sys.stdout,
+             ) as file_with_progress,
+         ):
+             response = session.put(
+                 url,
+                 data=file_with_progress,
+                 headers=headers,
+                 timeout=(60, 1800),  # 60 seconds to connect and 1800 seconds for server response
+             )
+             response.raise_for_status()
+             return response
+     finally:
+         session.close()
+
+
+ def _download_from_presigned_url(
+     url: str, output_path: Path, headers: Optional[dict[str, str]] = None, max_retries: int = 3
+ ) -> None:
+     """Download file with progress bar and retries"""
+     session = _create_retry_session(max_retries=max_retries)
+
+     try:
+         # Stream download with progress
+         response = session.get(
+             url, headers=headers, stream=True, timeout=(60, 300)
+         )  # 60 seconds to connect and 300 seconds for server response
+         response.raise_for_status()
+
+         total_size = int(response.headers.get('content-length', 0))
+         progress_bar = tqdm(
+             desc='Downloading',
+             total=total_size,
+             unit='B',
+             unit_scale=True,
+             unit_divisor=1024,
+             miniters=1,
+             ncols=100,
+             file=sys.stdout,
+         )
+         with open(output_path, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 if chunk:
+                     f.write(chunk)
+                     progress_bar.update(len(chunk))
+     finally:
+         session.close()
+
+
+ # TODO: This will be replaced by drop_table with cloud table uri
+ def delete_replica(dest_path: str) -> None:
+     """Delete cloud replica"""
+     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+     delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
+     response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+     if response.status_code != 200:
+         raise excs.Error(f'Error deleting replica: {response.text}')
+     response_json = response.json()
+     if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+         raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
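
For reference, urllib3's Retry sleeps roughly backoff_factor * 2**(n-1) seconds before the n-th retry (the exact formula and cap depend on the urllib3 version), so backoff_factor=1.0 yields pauses on the order of 1s, 2s, 4s. A hypothetical smoke test of the helper; the httpbin URL is illustrative only and not part of the package:

    import requests
    from pixeltable.share.publish import _create_retry_session

    session = _create_retry_session(max_retries=2, backoff_factor=0.1)
    try:
        # 503 is in the default status_forcelist and GET is an allowed method,
        # so urllib3 retries the request, then raises RetryError once retries run out.
        session.get('https://httpbin.org/status/503', timeout=10)
    except requests.exceptions.RetryError as err:
        print(f'retries exhausted: {err}')
    finally:
        session.close()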
pixeltable/type_system.py CHANGED
@@ -9,8 +9,11 @@ import types
  import typing
  import urllib.parse
  import urllib.request
+ from pathlib import Path
  from typing import Any, ClassVar, Iterable, Literal, Mapping, Optional, Sequence, Union

+ from typing import _GenericAlias  # type: ignore[attr-defined] # isort: skip
+
  import av
  import jsonschema
  import jsonschema.protocols
@@ -24,8 +27,6 @@ from typing_extensions import _AnnotatedAlias
  import pixeltable.exceptions as excs
  from pixeltable.utils import parse_local_file_path

- from typing import _GenericAlias  # type: ignore[attr-defined] # isort: skip
-

  class ColumnType:
      @enum.unique
@@ -292,7 +293,11 @@ class ColumnType:

      @classmethod
      def from_python_type(
-         cls, t: type | _GenericAlias, nullable_default: bool = False, allow_builtin_types: bool = True
+         cls,
+         t: type | _GenericAlias,
+         nullable_default: bool = False,
+         allow_builtin_types: bool = True,
+         infer_pydantic_json: bool = False,
      ) -> Optional[ColumnType]:
          """
          Convert a Python type into a Pixeltable `ColumnType` instance.
@@ -305,6 +310,8 @@ class ColumnType:
              allowed (as in UDF definitions). If False, then only Pixeltable types such as `pxt.String`,
              `pxt.Int`, etc., will be allowed (as in schema definitions). `Optional` and `Required`
              designations will be allowed regardless.
+             infer_pydantic_json: If True, accepts an extended set of built-ins (eg, Enum, Path) and returns the type to
+                 which pydantic.BaseModel.model_dump(mode='json') serializes it.
          """
          origin = typing.get_origin(t)
          type_args = typing.get_args(t)
@@ -314,7 +321,9 @@ class ColumnType:
              # `t` is a type of the form Optional[T] (equivalently, T | None or None | T).
              # We treat it as the underlying type but with nullable=True.
              underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
-             underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
+             underlying = cls.from_python_type(
+                 underlying_py_type, allow_builtin_types=allow_builtin_types, infer_pydantic_json=infer_pydantic_json
+             )
              if underlying is not None:
                  return underlying.copy(nullable=True)
          elif origin is Required:
@@ -341,6 +350,13 @@ class ColumnType:
              if literal_type is None:
                  return None
              return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+         if infer_pydantic_json and isinstance(t, type) and issubclass(t, enum.Enum):
+             literal_type = cls.infer_common_literal_type(member.value for member in t)
+             if literal_type is None:
+                 return None
+             return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
+         if infer_pydantic_json and t is Path:
+             return StringType(nullable=nullable_default)
          if t is str:
              return StringType(nullable=nullable_default)
          if t is int:
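
With infer_pydantic_json=True, from_python_type maps field types the way pydantic serializes them in model_dump(mode='json'): Enum fields map to the common literal type of their values and Path maps to a string type. A hedged illustration; Color is a made-up enum, and the assumption is that string-valued members infer to a string column type:

    import enum
    from pathlib import Path
    from pixeltable.type_system import ColumnType

    class Color(enum.Enum):
        RED = 'red'
        BLUE = 'blue'

    # All member values are strings, so the inferred column type is string-like.
    print(ColumnType.from_python_type(Color, infer_pydantic_json=True))
    # Path serializes to str under model_dump(mode='json').
    print(ColumnType.from_python_type(Path, infer_pydantic_json=True))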
pixeltable/utils/media_store.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import glob
+ import logging
  import os
  import re
  import shutil
@@ -19,10 +20,12 @@ from pixeltable import env
  if TYPE_CHECKING:
      from pixeltable.catalog import Column

+ _logger = logging.getLogger('pixeltable')
+

  class MediaStore:
      """
-     Utilities to manage media files stored in Env.media_dir
+     Utilities to manage media files stored in a local filesystem directory.

      Media file names are a composite of: table id, column id, tbl_version, new uuid:
      the table id/column id/tbl_version are redundant but useful for identifying all files for a table
@@ -30,38 +33,70 @@ class MediaStore:
      """

      pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid
+     __base_dir: Path
+
+     def __init__(self, base_dir: Path):
+         """Initialize a MediaStore with a base directory."""
+         assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
+         self.__base_dir = base_dir

      @classmethod
-     def _media_dir(cls) -> Path:
-         """Returns the media directory path."""
-         return env.Env.get().media_dir
+     def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
+         """Get a MediaStore instance for the given base URI, or the environment's media_dir if None."""
+         if base_uri is None:
+             return MediaStore(env.Env.get().media_dir)
+         raise NotImplementedError

      @classmethod
-     def _tmp_dir(cls) -> Path:
-         """Returns the temporary directory path."""
-         return env.Env.get().tmp_dir
+     def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
+         """Save a media binary data to a file in a MediaStore. format is ignored for binary data."""
+         assert isinstance(file_data, bytes)
+         with open(dest_path, 'wb') as f:
+             f.write(file_data)
+             f.flush()  # Ensures Python buffers are written to OS
+             os.fsync(f.fileno())  # Forces OS to write to physical storage
+         return dest_path

      @classmethod
-     def _prepare_media_path(cls, col: Column, ext: Optional[str] = None) -> Path:
+     def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
+         """Save a PIL Image to a file in a MediaStore with the specified format."""
+         if dest_path.suffix != f'.{format}':
+             dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
+
+         with open(dest_path, 'wb') as f:
+             image.save(f, format=format)
+             f.flush()  # Ensures Python buffers are written to OS
+             os.fsync(f.fileno())  # Forces OS to write to physical storage
+         return dest_path
+
+     def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
          """
          Construct a new, unique Path name for a persisted media file, and create the parent directory
          for the new Path if it does not already exist. The Path will reside in
          the environment's media_dir.
          """
          id_hex = uuid.uuid4().hex
-         parent = cls._media_dir() / col.tbl.id.hex / id_hex[:2] / id_hex[:4]
+         parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
          parent.mkdir(parents=True, exist_ok=True)
-         return parent / f'{col.tbl.id.hex}_{col.id}_{col.tbl.version}_{id_hex}{ext or ""}'
+         return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'

-     @classmethod
-     def resolve_tmp_url(cls, file_url: Optional[str]) -> Optional[Path]:
-         """Return path if the given url is a tmp file.
+     def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
+         """
+         Construct a new, unique Path name for a persisted media file, and create the parent directory
+         for the new Path if it does not already exist. The Path will reside in
+         the environment's media_dir.
+         """
+         assert col.tbl is not None, 'Column must be associated with a table'
+         return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)
+
+     def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
+         """Return path if the given url refers to a file managed by this MediaStore, else None.

          Args:
-             file_url: URL of the tmp media file to check
+             file_url: URL to check

          Returns:
-             If the file_url is a tmp file, return a Path() to the tmp file, None, otherwise
+             If the url is a managed file, return a Path() to the file, None, otherwise
          """
          if file_url is None:
              return None
@@ -74,93 +109,76 @@ class MediaStore:
              # remote url
              return None
          src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-         pxt_tmp_dir = str(cls._tmp_dir())
-         if not src_path.startswith(pxt_tmp_dir):
+         if not src_path.startswith(str(self.__base_dir)):
              # not a tmp file
              return None
          return Path(src_path)

-     @classmethod
-     def relocate_local_media_file(cls, src_path: Path, col: Column) -> str:
-         """Relocate a local file to the MediaStore, and return its new URL"""
-         dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
+     def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
+         """Relocate a local file to a MediaStore, and return its new URL"""
+         dest_path = self._prepare_media_path(col, ext=src_path.suffix)
          src_path.rename(dest_path)
-         return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
-
-     @classmethod
-     def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
-         """Save a media data to a file in the MediaStore
+         new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+         _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
+         return new_file_url
+
+     def copy_local_media_file(self, src_path: Path, col: Column) -> str:
+         """Copy a local file to a MediaStore, and return its new URL"""
+         dest_path = self._prepare_media_path(col, ext=src_path.suffix)
+         shutil.copy2(src_path, dest_path)
+         new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+         _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
+         return new_file_url
+
+     def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+         """Save a media data object to a file in a MediaStore
          Returns:
              dest_path: Path to the saved media file
              url: URL of the saved media file
          """
          assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
-         dest_path = cls._prepare_media_path(col)
+         dest_path = self._prepare_media_path(col)
          if isinstance(data, bytes):
-             dest_path = cls._save_binary_media_file(data, dest_path, format)
+             dest_path = self._save_binary_media_file(data, dest_path, format)
          elif isinstance(data, PIL.Image.Image):
-             dest_path = cls._save_pil_image_file(data, dest_path, format)
+             dest_path = self._save_pil_image_file(data, dest_path, format)
          else:
              raise ValueError(f'Unsupported media object type: {type(data)}')
-         url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
-         return dest_path, url
+         new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+         return dest_path, new_file_url

-     @classmethod
-     def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
-         """Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
-         assert isinstance(file_data, bytes)
-         with open(dest_path, 'wb') as f:
-             f.write(file_data)
-             f.flush()  # Ensures Python buffers are written to OS
-             os.fsync(f.fileno())  # Forces OS to write to physical storage
-         return dest_path
-
-     @classmethod
-     def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
-         """Save a PIL Image to a file in the MediaStore with the specified format."""
-         if dest_path.suffix != f'.{format}':
-             dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
-
-         with open(dest_path, 'wb') as f:
-             image.save(f, format=format)
-             f.flush()  # Ensures Python buffers are written to OS
-             os.fsync(f.fileno())  # Forces OS to write to physical storage
-         return dest_path
-
-     @classmethod
-     def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
+     def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
          """Delete all files belonging to tbl_id. If tbl_version is not None, delete
          only those files belonging to the specified tbl_version."""
          assert tbl_id is not None
          if tbl_version is None:
              # Remove the entire folder for this table id.
-             path = cls._media_dir() / tbl_id.hex
+             path = self.__base_dir / tbl_id.hex
              if path.exists():
                  shutil.rmtree(path)
          else:
              # Remove only the elements for the specified tbl_version.
-             paths = glob.glob(
-                 str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
-             )
+             paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
              for p in paths:
                  os.remove(p)

-     @classmethod
-     def count(cls, tbl_id: UUID) -> int:
+     def count(self, tbl_id: Optional[UUID]) -> int:
          """
          Return number of files for given tbl_id.
          """
-         paths = glob.glob(str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
+         if tbl_id is None:
+             paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
+         else:
+             paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
          return len(paths)

-     @classmethod
-     def stats(cls) -> list[tuple[UUID, int, int, int]]:
-         paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
+     def stats(self) -> list[tuple[UUID, int, int, int]]:
+         paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
          # key: (tbl_id, col_id), value: (num_files, size)
          d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
          for p in paths:
              if not os.path.isdir(p):
-                 matched = re.match(cls.pattern, Path(p).name)
+                 matched = re.match(self.pattern, Path(p).name)
                  assert matched is not None
                  tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
                  file_info = os.stat(p)
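
MediaStore is now instantiable with an explicit base directory rather than a set of classmethods bound to Env.media_dir; MediaStore.get() preserves the old default. A small sketch of the instance API, grounded in the methods shown above:

    from pixeltable.utils.media_store import MediaStore

    store = MediaStore.get()  # wraps env.Env.get().media_dir
    print(store.count(None))  # with tbl_id=None, counts entries under the media root
    for tbl_id, col_id, num_files, size in store.stats():
        print(tbl_id, col_id, num_files, size)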
@@ -170,3 +188,50 @@ class MediaStore:
      result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
      result.sort(key=lambda e: e[3], reverse=True)
      return result
+
+
+ class TempStore:
+     """
+     A temporary store for files of data that are not yet persisted to their destination(s).
+     A destination is typically either a MediaStore (local persisted files) or a cloud object store.
+
+     The TempStore class has no internal state. It provides functionality to manage temporary files
+     in the env.Env.get().tmp_dir directory.
+     It reuses some of the MediaStore functionality to create unique file names and save objects.
+     """
+
+     @classmethod
+     def _tmp_dir(cls) -> Path:
+         """Returns the path to the temporary directory where files are stored."""
+         from pixeltable import env
+
+         return env.Env.get().tmp_dir
+
+     @classmethod
+     def count(cls, tbl_id: Optional[UUID] = None) -> int:
+         return MediaStore(cls._tmp_dir()).count(tbl_id)
+
+     @classmethod
+     def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
+         return MediaStore(cls._tmp_dir()).resolve_url(file_url)
+
+     @classmethod
+     def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
+         return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)
+
+     @classmethod
+     def delete_media_file(cls, obj_path: Path) -> None:
+         """Delete a media object from the temporary store."""
+         assert obj_path is not None, 'Object path must be provided'
+         assert obj_path.exists(), f'Object path does not exist: {obj_path}'
+         assert cls.resolve_url(str(obj_path)) is not None, f'Object path is not a valid media store path: {obj_path}'
+         obj_path.unlink()
+
+     @classmethod
+     def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
+         """Return a new, unique Path located in the temporary store.
+         If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
+         If tbl_id is None, a random UUID will be used to create the path."""
+         if tbl_id is not None:
+             return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
+         return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
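
TempStore centralizes what used to be ad-hoc tmp_dir handling (e.g., Env.create_tmp_path) behind the same naming scheme as MediaStore; pull_replica and _download_bundle_from_s3 above now use it for scratch bundles. A usage sketch, assuming an initialized Pixeltable environment:

    from pixeltable.utils.media_store import TempStore

    tmp_path = TempStore.create_path(extension='.tar.gz')  # unique path under Env tmp_dir
    tmp_path.write_bytes(b'scratch bundle data')           # stand-in for a real bundle
    TempStore.delete_media_file(tmp_path)                  # asserts the path is managed, then unlinks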
pixeltable/utils/pydantic.py ADDED
@@ -0,0 +1,60 @@
+ import typing
+ from datetime import datetime
+ from enum import Enum
+ from types import UnionType
+ from typing import Any, Union
+
+ import pydantic
+
+
+ def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
+     """
+     Determine if instances of a Pydantic model can be converted to valid JSON
+     based on the type hints of its fields.
+     """
+     type_hints = typing.get_type_hints(model)
+     return all(_type_is_json_convertible(field_type) for field_type in type_hints.values())
+
+
+ def _type_is_json_convertible(type_hint: Any) -> bool:
+     """
+     Recursively check if a type hint represents a JSON-compatible type.
+
+     TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
+     """
+     if type_hint is type(None):
+         return True
+     if type_hint is Any:
+         return False
+
+     if type_hint in (str, int, float, bool, datetime):
+         return True
+
+     if isinstance(type_hint, type) and issubclass(type_hint, Enum):
+         return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
+
+     if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
+         return is_json_convertible(type_hint)
+
+     origin = typing.get_origin(type_hint)
+     args = typing.get_args(type_hint)
+
+     if origin in (Union, UnionType):
+         return all(_type_is_json_convertible(arg) for arg in args)
+
+     if origin in (list, tuple):
+         return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
+
+     if origin is dict:
+         if len(args) != 2:
+             # we can't tell what this is
+             return False
+         key_type, value_type = args
+         # keys must be strings, values must be json-convertible
+         return key_type is str and _type_is_json_convertible(value_type)
+
+     # Literal types are json-convertible if their values are
+     if origin is typing.Literal:
+         return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
+
+     return False
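
is_json_convertible walks a model's field annotations recursively, so nested models, unions, containers, and Literals are all vetted before instances are treated as JSON-serializable. A quick illustration; the models here are hypothetical:

    import pydantic
    from pixeltable.utils.pydantic import is_json_convertible

    class Address(pydantic.BaseModel):
        street: str
        zip_code: int

    class User(pydantic.BaseModel):
        name: str
        address: Address
        tags: list[str]

    class Opaque(pydantic.BaseModel):
        blob: object  # neither a JSON scalar nor a recognized container

    print(is_json_convertible(User))    # True: every field resolves to JSON-compatible types
    print(is_json_convertible(Opaque))  # False: object falls through to the final return False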