pixeltable 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (69)
  1. pixeltable/__init__.py +4 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/catalog.py +7 -9
  4. pixeltable/catalog/column.py +49 -0
  5. pixeltable/catalog/insertable_table.py +0 -7
  6. pixeltable/catalog/schema_object.py +1 -14
  7. pixeltable/catalog/table.py +180 -67
  8. pixeltable/catalog/table_version.py +42 -146
  9. pixeltable/catalog/table_version_path.py +6 -5
  10. pixeltable/catalog/view.py +2 -1
  11. pixeltable/config.py +24 -9
  12. pixeltable/dataframe.py +5 -6
  13. pixeltable/env.py +113 -21
  14. pixeltable/exec/aggregation_node.py +1 -1
  15. pixeltable/exec/cache_prefetch_node.py +4 -3
  16. pixeltable/exec/exec_node.py +0 -8
  17. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  18. pixeltable/exec/expr_eval/globals.py +1 -0
  19. pixeltable/exec/expr_eval/schedulers.py +52 -19
  20. pixeltable/exec/in_memory_data_node.py +2 -3
  21. pixeltable/exprs/array_slice.py +2 -2
  22. pixeltable/exprs/data_row.py +15 -2
  23. pixeltable/exprs/expr.py +9 -9
  24. pixeltable/exprs/function_call.py +61 -23
  25. pixeltable/exprs/globals.py +1 -2
  26. pixeltable/exprs/json_path.py +3 -3
  27. pixeltable/exprs/row_builder.py +25 -21
  28. pixeltable/exprs/string_op.py +3 -3
  29. pixeltable/func/expr_template_function.py +6 -3
  30. pixeltable/func/query_template_function.py +2 -2
  31. pixeltable/func/signature.py +30 -3
  32. pixeltable/func/tools.py +2 -2
  33. pixeltable/functions/anthropic.py +76 -27
  34. pixeltable/functions/deepseek.py +5 -1
  35. pixeltable/functions/gemini.py +11 -2
  36. pixeltable/functions/globals.py +2 -2
  37. pixeltable/functions/huggingface.py +6 -12
  38. pixeltable/functions/llama_cpp.py +9 -1
  39. pixeltable/functions/openai.py +76 -55
  40. pixeltable/functions/video.py +59 -6
  41. pixeltable/functions/vision.py +2 -2
  42. pixeltable/globals.py +86 -13
  43. pixeltable/io/datarows.py +3 -3
  44. pixeltable/io/fiftyone.py +7 -7
  45. pixeltable/io/globals.py +3 -3
  46. pixeltable/io/hf_datasets.py +4 -4
  47. pixeltable/io/label_studio.py +2 -1
  48. pixeltable/io/pandas.py +6 -6
  49. pixeltable/io/parquet.py +3 -3
  50. pixeltable/io/table_data_conduit.py +2 -2
  51. pixeltable/io/utils.py +2 -2
  52. pixeltable/iterators/audio.py +3 -2
  53. pixeltable/iterators/document.py +2 -8
  54. pixeltable/iterators/video.py +49 -9
  55. pixeltable/plan.py +0 -16
  56. pixeltable/share/packager.py +51 -42
  57. pixeltable/share/publish.py +134 -7
  58. pixeltable/store.py +5 -25
  59. pixeltable/type_system.py +5 -8
  60. pixeltable/utils/__init__.py +2 -2
  61. pixeltable/utils/arrow.py +5 -5
  62. pixeltable/utils/description_helper.py +3 -3
  63. pixeltable/utils/iceberg.py +1 -2
  64. pixeltable/utils/media_store.py +131 -66
  65. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/METADATA +238 -122
  66. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/RECORD +69 -69
  67. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
  68. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
  69. {pixeltable-0.4.6.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import glob
4
+ import logging
4
5
  import os
5
6
  import re
6
7
  import shutil
@@ -19,10 +20,12 @@ from pixeltable import env
19
20
  if TYPE_CHECKING:
20
21
  from pixeltable.catalog import Column
21
22
 
23
+ _logger = logging.getLogger('pixeltable')
24
+
22
25
 
23
26
  class MediaStore:
24
27
  """
25
- Utilities to manage media files stored in Env.media_dir
28
+ Utilities to manage media files stored in a local filesystem directory.
26
29
 
27
30
  Media file names are a composite of: table id, column id, tbl_version, new uuid:
28
31
  the table id/column id/tbl_version are redundant but useful for identifying all files for a table
@@ -30,38 +33,70 @@ class MediaStore:
30
33
  """
31
34
 
32
35
  pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)') # tbl_id, col_id, version, uuid
36
+ __base_dir: Path
37
+
38
+ def __init__(self, base_dir: Path):
39
+ """Initialize a MediaStore with a base directory."""
40
+ assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
41
+ self.__base_dir = base_dir
33
42
 
34
43
  @classmethod
35
- def _media_dir(cls) -> Path:
36
- """Returns the media directory path."""
37
- return env.Env.get().media_dir
44
+ def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
45
+ """Get a MediaStore instance for the given base URI, or the environment's media_dir if None."""
46
+ if base_uri is None:
47
+ return MediaStore(env.Env.get().media_dir)
48
+ raise NotImplementedError
38
49
 
39
50
  @classmethod
40
- def _tmp_dir(cls) -> Path:
41
- """Returns the temporary directory path."""
42
- return env.Env.get().tmp_dir
51
+ def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
52
+ """Save a media binary data to a file in a MediaStore. format is ignored for binary data."""
53
+ assert isinstance(file_data, bytes)
54
+ with open(dest_path, 'wb') as f:
55
+ f.write(file_data)
56
+ f.flush() # Ensures Python buffers are written to OS
57
+ os.fsync(f.fileno()) # Forces OS to write to physical storage
58
+ return dest_path
43
59
 
44
60
  @classmethod
45
- def _prepare_media_path(cls, col: Column, ext: Optional[str] = None) -> Path:
61
+ def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
62
+ """Save a PIL Image to a file in a MediaStore with the specified format."""
63
+ if dest_path.suffix != f'.{format}':
64
+ dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
65
+
66
+ with open(dest_path, 'wb') as f:
67
+ image.save(f, format=format)
68
+ f.flush() # Ensures Python buffers are written to OS
69
+ os.fsync(f.fileno()) # Forces OS to write to physical storage
70
+ return dest_path
71
+
72
+ def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
46
73
  """
47
74
  Construct a new, unique Path name for a persisted media file, and create the parent directory
48
75
  for the new Path if it does not already exist. The Path will reside in
49
76
  the environment's media_dir.
50
77
  """
51
78
  id_hex = uuid.uuid4().hex
52
- parent = cls._media_dir() / col.tbl.id.hex / id_hex[:2] / id_hex[:4]
79
+ parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
53
80
  parent.mkdir(parents=True, exist_ok=True)
54
- return parent / f'{col.tbl.id.hex}_{col.id}_{col.tbl.version}_{id_hex}{ext or ""}'
81
+ return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
55
82
 
56
- @classmethod
57
- def resolve_tmp_url(cls, file_url: Optional[str]) -> Optional[Path]:
58
- """Return path if the given url is a tmp file.
83
+ def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
84
+ """
85
+ Construct a new, unique Path name for a persisted media file, and create the parent directory
86
+ for the new Path if it does not already exist. The Path will reside in
87
+ the environment's media_dir.
88
+ """
89
+ assert col.tbl is not None, 'Column must be associated with a table'
90
+ return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)
91
+
92
+ def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
93
+ """Return path if the given url refers to a file managed by this MediaStore, else None.
59
94
 
60
95
  Args:
61
- file_url: URL of the tmp media file to check
96
+ file_url: URL to check
62
97
 
63
98
  Returns:
64
- If the file_url is a tmp file, return a Path() to the tmp file, None, otherwise
99
+ If the url is a managed file, return a Path() to the file, None, otherwise
65
100
  """
66
101
  if file_url is None:
67
102
  return None
@@ -74,93 +109,76 @@ class MediaStore:
74
109
  # remote url
75
110
  return None
76
111
  src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
77
- pxt_tmp_dir = str(cls._tmp_dir())
78
- if not src_path.startswith(pxt_tmp_dir):
112
+ if not src_path.startswith(str(self.__base_dir)):
79
113
  # not a tmp file
80
114
  return None
81
115
  return Path(src_path)
82
116
 
83
- @classmethod
84
- def relocate_local_media_file(cls, src_path: Path, col: Column) -> str:
85
- """Relocate a local file to the MediaStore, and return its new URL"""
86
- dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
117
+ def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
118
+ """Relocate a local file to a MediaStore, and return its new URL"""
119
+ dest_path = self._prepare_media_path(col, ext=src_path.suffix)
87
120
  src_path.rename(dest_path)
88
- return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
89
-
90
- @classmethod
91
- def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
92
- """Save a media data to a file in the MediaStore
121
+ new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
122
+ _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
123
+ return new_file_url
124
+
125
+ def copy_local_media_file(self, src_path: Path, col: Column) -> str:
126
+ """Copy a local file to a MediaStore, and return its new URL"""
127
+ dest_path = self._prepare_media_path(col, ext=src_path.suffix)
128
+ shutil.copy2(src_path, dest_path)
129
+ new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
130
+ _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
131
+ return new_file_url
132
+
133
+ def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
134
+ """Save a media data object to a file in a MediaStore
93
135
  Returns:
94
136
  dest_path: Path to the saved media file
95
137
  url: URL of the saved media file
96
138
  """
97
139
  assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
98
- dest_path = cls._prepare_media_path(col)
140
+ dest_path = self._prepare_media_path(col)
99
141
  if isinstance(data, bytes):
100
- dest_path = cls._save_binary_media_file(data, dest_path, format)
142
+ dest_path = self._save_binary_media_file(data, dest_path, format)
101
143
  elif isinstance(data, PIL.Image.Image):
102
- dest_path = cls._save_pil_image_file(data, dest_path, format)
144
+ dest_path = self._save_pil_image_file(data, dest_path, format)
103
145
  else:
104
146
  raise ValueError(f'Unsupported media object type: {type(data)}')
105
- url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
106
- return dest_path, url
147
+ new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
148
+ return dest_path, new_file_url
107
149
 
108
- @classmethod
109
- def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
110
- """Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
111
- assert isinstance(file_data, bytes)
112
- with open(dest_path, 'wb') as f:
113
- f.write(file_data)
114
- f.flush() # Ensures Python buffers are written to OS
115
- os.fsync(f.fileno()) # Forces OS to write to physical storage
116
- return dest_path
117
-
118
- @classmethod
119
- def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
120
- """Save a PIL Image to a file in the MediaStore with the specified format."""
121
- if dest_path.suffix != f'.{format}':
122
- dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
123
-
124
- with open(dest_path, 'wb') as f:
125
- image.save(f, format=format)
126
- f.flush() # Ensures Python buffers are written to OS
127
- os.fsync(f.fileno()) # Forces OS to write to physical storage
128
- return dest_path
129
-
130
- @classmethod
131
- def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
150
+ def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
132
151
  """Delete all files belonging to tbl_id. If tbl_version is not None, delete
133
152
  only those files belonging to the specified tbl_version."""
134
153
  assert tbl_id is not None
135
154
  if tbl_version is None:
136
155
  # Remove the entire folder for this table id.
137
- path = cls._media_dir() / tbl_id.hex
156
+ path = self.__base_dir / tbl_id.hex
138
157
  if path.exists():
139
158
  shutil.rmtree(path)
140
159
  else:
141
160
  # Remove only the elements for the specified tbl_version.
142
- paths = glob.glob(
143
- str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
144
- )
161
+ paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
145
162
  for p in paths:
146
163
  os.remove(p)
147
164
 
148
- @classmethod
149
- def count(cls, tbl_id: UUID) -> int:
165
+ def count(self, tbl_id: Optional[UUID]) -> int:
150
166
  """
151
167
  Return number of files for given tbl_id.
152
168
  """
153
- paths = glob.glob(str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
169
+ if tbl_id is None:
170
+ paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
171
+ else:
172
+ paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
154
173
  return len(paths)
155
174
 
156
- @classmethod
157
- def stats(cls) -> list[tuple[UUID, int, int, int]]:
158
- paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
175
+ def stats(self) -> list[tuple[UUID, int, int, int]]:
176
+ paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
159
177
  # key: (tbl_id, col_id), value: (num_files, size)
160
178
  d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
161
179
  for p in paths:
162
180
  if not os.path.isdir(p):
163
- matched = re.match(cls.pattern, Path(p).name)
181
+ matched = re.match(self.pattern, Path(p).name)
164
182
  assert matched is not None
165
183
  tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
166
184
  file_info = os.stat(p)
@@ -170,3 +188,50 @@ class MediaStore:
170
188
  result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
171
189
  result.sort(key=lambda e: e[3], reverse=True)
172
190
  return result
191
+
192
+
193
+ class TempStore:
194
+ """
195
+ A temporary store for files of data that are not yet persisted to their destination(s).
196
+ A destination is typically either a MediaStore (local persisted files) or a cloud object store.
197
+
198
+ The TempStore class has no internal state. It provides functionality to manage temporary files
199
+ in the env.Env.get().tmp_dir directory.
200
+ It reuses some of the MediaStore functionality to create unique file names and save objects.
201
+ """
202
+
203
+ @classmethod
204
+ def _tmp_dir(cls) -> Path:
205
+ """Returns the path to the temporary directory where files are stored."""
206
+ from pixeltable import env
207
+
208
+ return env.Env.get().tmp_dir
209
+
210
+ @classmethod
211
+ def count(cls, tbl_id: Optional[UUID] = None) -> int:
212
+ return MediaStore(cls._tmp_dir()).count(tbl_id)
213
+
214
+ @classmethod
215
+ def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
216
+ return MediaStore(cls._tmp_dir()).resolve_url(file_url)
217
+
218
+ @classmethod
219
+ def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
220
+ return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)
221
+
222
+ @classmethod
223
+ def delete_media_file(cls, obj_path: Path) -> None:
224
+ """Delete a media object from the temporary store."""
225
+ assert obj_path is not None, 'Object path must be provided'
226
+ assert obj_path.exists(), f'Object path does not exist: {obj_path}'
227
+ assert cls.resolve_url(str(obj_path)) is not None, f'Object path is not a valid media store path: {obj_path}'
228
+ obj_path.unlink()
229
+
230
+ @classmethod
231
+ def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
232
+ """Return a new, unique Path located in the temporary store.
233
+ If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
234
+ If tbl_id is None, a random UUID will be used to create the path."""
235
+ if tbl_id is not None:
236
+ return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
237
+ return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'