pixeltable 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +24 -8
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +42 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +3 -3
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/METADATA +1 -1
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/RECORD +44 -41
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/utils/media_store.py
DELETED
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import glob
|
|
4
|
-
import logging
|
|
5
|
-
import os
|
|
6
|
-
import re
|
|
7
|
-
import shutil
|
|
8
|
-
import urllib.parse
|
|
9
|
-
import urllib.request
|
|
10
|
-
import uuid
|
|
11
|
-
from collections import defaultdict
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
from typing import TYPE_CHECKING, Optional
|
|
14
|
-
from uuid import UUID
|
|
15
|
-
|
|
16
|
-
import PIL.Image
|
|
17
|
-
|
|
18
|
-
from pixeltable import env
|
|
19
|
-
|
|
20
|
-
if TYPE_CHECKING:
|
|
21
|
-
from pixeltable.catalog import Column
|
|
22
|
-
|
|
23
|
-
_logger = logging.getLogger('pixeltable')
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class MediaStore:
    """
    Utilities to manage media files stored in a local filesystem directory.

    Media file names are a composite of: table id, column id, tbl_version, new uuid:
    the table id/column id/tbl_version are redundant but useful for identifying all files for a table
    or all files created for a particular version of a table.

    Files are laid out as ``<base_dir>/<tbl_id hex>/<uuid[:2]>/<uuid[:4]>/<file name>``.
    """

    # groups: tbl_id, col_id, version, uuid
    pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')
    # root directory under which all files of this store live
    __base_dir: Path

    def __init__(self, base_dir: Path):
        """Initialize a MediaStore with a base directory."""
        assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
        self.__base_dir = base_dir

    @classmethod
    def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
        """Get a MediaStore instance for the given base URI, or the environment's media_dir if None.

        Raises:
            NotImplementedError: if base_uri is not None (explicit base URIs are not supported here).
        """
        if base_uri is None:
            return MediaStore(env.Env.get().media_dir)
        raise NotImplementedError

    @classmethod
    def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
        """Write file_data to dest_path and return dest_path. format is ignored for binary data."""
        assert isinstance(file_data, bytes)
        with open(dest_path, 'wb') as f:
            f.write(file_data)
            f.flush()  # Ensures Python buffers are written to OS
            os.fsync(f.fileno())  # Forces OS to write to physical storage
        return dest_path

    @classmethod
    def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
        """Save a PIL Image to dest_path in the given format.

        If dest_path does not already end in '.<format>', that suffix is appended. The (possibly
        adjusted) path is returned.
        """
        # NOTE(review): a None format would append a literal '.None' suffix here; callers presumably
        # always pass a concrete format for images - confirm.
        if dest_path.suffix != f'.{format}':
            dest_path = dest_path.with_name(f'{dest_path.name}.{format}')

        with open(dest_path, 'wb') as f:
            image.save(f, format=format)
            f.flush()  # Ensures Python buffers are written to OS
            os.fsync(f.fileno())  # Forces OS to write to physical storage
        return dest_path

    def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
        """
        Construct a new, unique Path name for a persisted media file, and create the parent directory
        for the new Path if it does not already exist. The Path will reside in this store's base directory.

        Args:
            tbl_id: id of the owning table; its hex form is the top-level subdirectory and file name prefix
            col_id: column id embedded in the file name
            tbl_version: table version embedded in the file name
            ext: optional suffix appended verbatim to the file name (e.g. '.png')
        """
        id_hex = uuid.uuid4().hex
        # two levels of fan-out derived from the random id keep individual directories small
        parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
        parent.mkdir(parents=True, exist_ok=True)
        return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'

    def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
        """
        Construct a new, unique Path name for a persisted media file of the given column, and create
        the parent directory for the new Path if it does not already exist.
        """
        assert col.tbl is not None, 'Column must be associated with a table'
        return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)

    def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
        """Return path if the given url refers to a file managed by this MediaStore, else None.

        Args:
            file_url: URL to check

        Returns:
            If the url is a managed file, return a Path() to the file, None, otherwise
        """
        if file_url is None:
            return None
        assert isinstance(file_url, str), type(file_url)
        parsed = urllib.parse.urlparse(file_url)
        # We should never be passed a local file path here. The "len > 1" ensures that Windows
        # file paths aren't mistaken for URLs with a single-character scheme.
        assert len(parsed.scheme) > 1, file_url
        if parsed.scheme != 'file':
            # remote url
            return None
        src_path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
        # Compare path components rather than string prefixes, so that a sibling directory such as
        # '<base_dir>2' is not mistaken for a location inside this store.
        if src_path != self.__base_dir and self.__base_dir not in src_path.parents:
            # not located under this store's base directory
            return None
        return src_path

    def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
        """Relocate a local file to a MediaStore, and return its new URL"""
        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
        # shutil.move renames when possible and falls back to copy+delete when the source and the
        # store are on different filesystems, where a plain rename fails.
        shutil.move(str(src_path), str(dest_path))
        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
        _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
        return new_file_url

    def copy_local_media_file(self, src_path: Path, col: Column) -> str:
        """Copy a local file to a MediaStore, and return its new URL"""
        dest_path = self._prepare_media_path(col, ext=src_path.suffix)
        shutil.copy2(src_path, dest_path)
        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
        _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
        return new_file_url

    def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
        """Save a media data object to a file in a MediaStore

        Args:
            data: raw bytes or a PIL image
            col: media-typed column the object belongs to
            format: image format used when saving a PIL image; ignored for bytes

        Returns:
            dest_path: Path to the saved media file
            url: URL of the saved media file

        Raises:
            ValueError: if data is neither bytes nor a PIL image
        """
        assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
        dest_path = self._prepare_media_path(col)
        if isinstance(data, bytes):
            dest_path = self._save_binary_media_file(data, dest_path, format)
        elif isinstance(data, PIL.Image.Image):
            dest_path = self._save_pil_image_file(data, dest_path, format)
        else:
            raise ValueError(f'Unsupported media object type: {type(data)}')
        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
        return dest_path, new_file_url

    def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
        """Delete all files belonging to tbl_id. If tbl_version is not None, delete
        only those files belonging to the specified tbl_version."""
        assert tbl_id is not None
        if tbl_version is None:
            # Remove the entire folder for this table id.
            path = self.__base_dir / tbl_id.hex
            if path.exists():
                shutil.rmtree(path)
        else:
            # Remove only the elements for the specified tbl_version.
            paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
            for p in paths:
                os.remove(p)

    def count(self, tbl_id: Optional[UUID]) -> int:
        """
        Return number of files for given tbl_id, or the number of all files in the store if tbl_id is None.
        """
        if tbl_id is None:
            # '**' is required for recursive=True to have any effect; directories are not files
            # and are therefore excluded from the count.
            paths = glob.glob(str(self.__base_dir / '**' / '*'), recursive=True)
            return sum(1 for p in paths if not os.path.isdir(p))
        paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
        return len(paths)

    def stats(self) -> list[tuple[UUID, int, int, int]]:
        """Return per-column storage statistics.

        Returns:
            A list of (tbl_id, col_id, num_files, total_size_in_bytes), sorted by size, largest first.
            Files whose names do not follow the media file naming scheme are ignored.
        """
        paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
        # key: (tbl_id, col_id), value: (num_files, size)
        d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
        for p in paths:
            if os.path.isdir(p):
                continue
            matched = self.pattern.match(Path(p).name)
            if matched is None:
                # not a media file created by this store (e.g. a temp file named by a bare uuid)
                continue
            try:
                tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
            except ValueError:
                # the leading hex run is not a valid table id
                continue
            file_info = os.stat(p)
            t = d[tbl_id, col_id]
            t[0] += 1
            t[1] += file_info.st_size
        result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
        result.sort(key=lambda e: e[3], reverse=True)
        return result

    def clear(self) -> None:
        """Clear all files from the media store."""
        assert self.__base_dir.exists()
        # remove the whole tree, then recreate the (now empty) base directory
        shutil.rmtree(self.__base_dir)
        self.__base_dir.mkdir()
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
class TempStore:
    """
    A temporary store for files of data that are not yet persisted to their destination(s).
    A destination is typically either a MediaStore (local persisted files) or a cloud object store.

    The TempStore class has no internal state. It provides functionality to manage temporary files
    in the env.Env.get().tmp_dir directory.
    It reuses some of the MediaStore functionality to create unique file names and save objects.
    """

    @classmethod
    def _tmp_dir(cls) -> Path:
        """Returns the path to the temporary directory where files are stored."""
        from pixeltable import env

        return env.Env.get().tmp_dir

    @classmethod
    def count(cls, tbl_id: Optional[UUID] = None) -> int:
        """Return the file count reported by a MediaStore rooted at the temporary directory."""
        return MediaStore(cls._tmp_dir()).count(tbl_id)

    @classmethod
    def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
        """Return the local Path if file_url refers to a file inside the temporary directory, else None."""
        return MediaStore(cls._tmp_dir()).resolve_url(file_url)

    @classmethod
    def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
        """Save a media object into the temporary directory; returns (path, file URL)."""
        return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)

    @classmethod
    def delete_media_file(cls, obj_path: Path) -> None:
        """Delete a media object from the temporary store."""
        assert obj_path is not None, 'Object path must be provided'
        assert obj_path.exists(), f'Object path does not exist: {obj_path}'
        # resolve_url() expects a URL, not a filesystem path: it asserts that the argument has a
        # scheme longer than one character. Convert the path to a file: URL the same way the
        # MediaStore builds its URLs before checking that it lives in the temporary directory.
        file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(obj_path)))
        assert cls.resolve_url(file_url) is not None, f'Object path is not a valid media store path: {obj_path}'
        obj_path.unlink()

    @classmethod
    def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
        """Return a new, unique Path located in the temporary store.
        If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
        If tbl_id is None, a random UUID will be used to create the path."""
        if tbl_id is not None:
            # column id and table version are not known for temp files; 0 is used as a placeholder
            return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
        return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'

    @classmethod
    def clear(cls) -> None:
        """Clear all files from the temporary store."""
        MediaStore(cls._tmp_dir()).clear()
|
pixeltable/utils/s3.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from typing import Any
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def get_client(**kwargs: Any) -> Any:
    """Create a boto3 S3 client, falling back to unsigned requests when no credentials are configured.

    Args:
        **kwargs: passed through to botocore.config.Config

    Returns:
        A boto3 's3' client. If the default boto3 session has no credentials, the client is
        configured with signature_version=botocore.UNSIGNED.
    """
    import boto3
    import botocore
    import botocore.config  # make sure the submodule is loaded rather than relying on boto3 importing it

    credentials = boto3.Session().get_credentials()
    config_args = dict(kwargs)  # never mutate the caller's kwargs
    if credentials is None:
        # No credentials available, use unsigned mode
        config_args['signature_version'] = botocore.UNSIGNED
    else:
        # Kept from the original implementation: resolve the credentials before building the client.
        # The explicit None check above replaces catching AttributeError, which also swallowed
        # unrelated AttributeErrors raised while constructing the signed client.
        credentials.get_frozen_credentials()
    config = botocore.config.Config(**config_args)
    return boto3.client('s3', config=config)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|