pixeltable-0.4.13-py3-none-any.whl → pixeltable-0.4.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (55):
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/utils/media_store.py DELETED
@@ -1,248 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import glob
4
- import logging
5
- import os
6
- import re
7
- import shutil
8
- import urllib.parse
9
- import urllib.request
10
- import uuid
11
- from collections import defaultdict
12
- from pathlib import Path
13
- from typing import TYPE_CHECKING, Optional
14
- from uuid import UUID
15
-
16
- import PIL.Image
17
-
18
- from pixeltable import env
19
-
20
- if TYPE_CHECKING:
21
- from pixeltable.catalog import Column
22
-
23
- _logger = logging.getLogger('pixeltable')
24
-
25
-
26
- class MediaStore:
27
- """
28
- Utilities to manage media files stored in a local filesystem directory.
29
-
30
- Media file names are a composite of: table id, column id, tbl_version, new uuid:
31
- the table id/column id/tbl_version are redundant but useful for identifying all files for a table
32
- or all files created for a particular version of a table
33
- """
34
-
35
- pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)') # tbl_id, col_id, version, uuid
36
- __base_dir: Path
37
-
38
- def __init__(self, base_dir: Path):
39
- """Initialize a MediaStore with a base directory."""
40
- assert isinstance(base_dir, Path), 'Base directory must be a Path instance.'
41
- self.__base_dir = base_dir
42
-
43
- @classmethod
44
- def get(cls, base_uri: Optional[Path] = None) -> MediaStore:
45
- """Get a MediaStore instance for the given base URI, or the environment's media_dir if None."""
46
- if base_uri is None:
47
- return MediaStore(env.Env.get().media_dir)
48
- raise NotImplementedError
49
-
50
- @classmethod
51
- def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
52
- """Save a media binary data to a file in a MediaStore. format is ignored for binary data."""
53
- assert isinstance(file_data, bytes)
54
- with open(dest_path, 'wb') as f:
55
- f.write(file_data)
56
- f.flush() # Ensures Python buffers are written to OS
57
- os.fsync(f.fileno()) # Forces OS to write to physical storage
58
- return dest_path
59
-
60
- @classmethod
61
- def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
62
- """Save a PIL Image to a file in a MediaStore with the specified format."""
63
- if dest_path.suffix != f'.{format}':
64
- dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
65
-
66
- with open(dest_path, 'wb') as f:
67
- image.save(f, format=format)
68
- f.flush() # Ensures Python buffers are written to OS
69
- os.fsync(f.fileno()) # Forces OS to write to physical storage
70
- return dest_path
71
-
72
- def _prepare_media_path_raw(self, tbl_id: UUID, col_id: int, tbl_version: int, ext: Optional[str] = None) -> Path:
73
- """
74
- Construct a new, unique Path name for a persisted media file, and create the parent directory
75
- for the new Path if it does not already exist. The Path will reside in
76
- the environment's media_dir.
77
- """
78
- id_hex = uuid.uuid4().hex
79
- parent = self.__base_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
80
- parent.mkdir(parents=True, exist_ok=True)
81
- return parent / f'{tbl_id.hex}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
82
-
83
- def _prepare_media_path(self, col: Column, ext: Optional[str] = None) -> Path:
84
- """
85
- Construct a new, unique Path name for a persisted media file, and create the parent directory
86
- for the new Path if it does not already exist. The Path will reside in
87
- the environment's media_dir.
88
- """
89
- assert col.tbl is not None, 'Column must be associated with a table'
90
- return self._prepare_media_path_raw(col.tbl.id, col.id, col.tbl.version, ext)
91
-
92
- def resolve_url(self, file_url: Optional[str]) -> Optional[Path]:
93
- """Return path if the given url refers to a file managed by this MediaStore, else None.
94
-
95
- Args:
96
- file_url: URL to check
97
-
98
- Returns:
99
- If the url is a managed file, return a Path() to the file, None, otherwise
100
- """
101
- if file_url is None:
102
- return None
103
- assert isinstance(file_url, str), type(file_url)
104
- parsed = urllib.parse.urlparse(file_url)
105
- # We should never be passed a local file path here. The "len > 1" ensures that Windows
106
- # file paths aren't mistaken for URLs with a single-character scheme.
107
- assert len(parsed.scheme) > 1, file_url
108
- if parsed.scheme != 'file':
109
- # remote url
110
- return None
111
- src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
112
- if not src_path.startswith(str(self.__base_dir)):
113
- # not a tmp file
114
- return None
115
- return Path(src_path)
116
-
117
- def relocate_local_media_file(self, src_path: Path, col: Column) -> str:
118
- """Relocate a local file to a MediaStore, and return its new URL"""
119
- dest_path = self._prepare_media_path(col, ext=src_path.suffix)
120
- src_path.rename(dest_path)
121
- new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
122
- _logger.debug(f'Media Storage: moved {src_path} to {new_file_url}')
123
- return new_file_url
124
-
125
- def copy_local_media_file(self, src_path: Path, col: Column) -> str:
126
- """Copy a local file to a MediaStore, and return its new URL"""
127
- dest_path = self._prepare_media_path(col, ext=src_path.suffix)
128
- shutil.copy2(src_path, dest_path)
129
- new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
130
- _logger.debug(f'Media Storage: copied {src_path} to {new_file_url}')
131
- return new_file_url
132
-
133
- def save_media_object(self, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
134
- """Save a media data object to a file in a MediaStore
135
- Returns:
136
- dest_path: Path to the saved media file
137
- url: URL of the saved media file
138
- """
139
- assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
140
- dest_path = self._prepare_media_path(col)
141
- if isinstance(data, bytes):
142
- dest_path = self._save_binary_media_file(data, dest_path, format)
143
- elif isinstance(data, PIL.Image.Image):
144
- dest_path = self._save_pil_image_file(data, dest_path, format)
145
- else:
146
- raise ValueError(f'Unsupported media object type: {type(data)}')
147
- new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
148
- return dest_path, new_file_url
149
-
150
- def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
151
- """Delete all files belonging to tbl_id. If tbl_version is not None, delete
152
- only those files belonging to the specified tbl_version."""
153
- assert tbl_id is not None
154
- if tbl_version is None:
155
- # Remove the entire folder for this table id.
156
- path = self.__base_dir / tbl_id.hex
157
- if path.exists():
158
- shutil.rmtree(path)
159
- else:
160
- # Remove only the elements for the specified tbl_version.
161
- paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True)
162
- for p in paths:
163
- os.remove(p)
164
-
165
- def count(self, tbl_id: Optional[UUID]) -> int:
166
- """
167
- Return number of files for given tbl_id.
168
- """
169
- if tbl_id is None:
170
- paths = glob.glob(str(self.__base_dir / '*'), recursive=True)
171
- else:
172
- paths = glob.glob(str(self.__base_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
173
- return len(paths)
174
-
175
- def stats(self) -> list[tuple[UUID, int, int, int]]:
176
- paths = glob.glob(str(self.__base_dir) + '/**', recursive=True)
177
- # key: (tbl_id, col_id), value: (num_files, size)
178
- d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
179
- for p in paths:
180
- if not os.path.isdir(p):
181
- matched = re.match(self.pattern, Path(p).name)
182
- assert matched is not None
183
- tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
184
- file_info = os.stat(p)
185
- t = d[tbl_id, col_id]
186
- t[0] += 1
187
- t[1] += file_info.st_size
188
- result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
189
- result.sort(key=lambda e: e[3], reverse=True)
190
- return result
191
-
192
- def clear(self) -> None:
193
- """Clear all files from the media store."""
194
- assert self.__base_dir.exists()
195
- shutil.rmtree(self.__base_dir)
196
- self.__base_dir.mkdir()
197
-
198
-
199
- class TempStore:
200
- """
201
- A temporary store for files of data that are not yet persisted to their destination(s).
202
- A destination is typically either a MediaStore (local persisted files) or a cloud object store.
203
-
204
- The TempStore class has no internal state. It provides functionality to manage temporary files
205
- in the env.Env.get().tmp_dir directory.
206
- It reuses some of the MediaStore functionality to create unique file names and save objects.
207
- """
208
-
209
- @classmethod
210
- def _tmp_dir(cls) -> Path:
211
- """Returns the path to the temporary directory where files are stored."""
212
- from pixeltable import env
213
-
214
- return env.Env.get().tmp_dir
215
-
216
- @classmethod
217
- def count(cls, tbl_id: Optional[UUID] = None) -> int:
218
- return MediaStore(cls._tmp_dir()).count(tbl_id)
219
-
220
- @classmethod
221
- def resolve_url(cls, file_url: Optional[str]) -> Optional[Path]:
222
- return MediaStore(cls._tmp_dir()).resolve_url(file_url)
223
-
224
- @classmethod
225
- def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
226
- return MediaStore(cls._tmp_dir()).save_media_object(data, col, format)
227
-
228
- @classmethod
229
- def delete_media_file(cls, obj_path: Path) -> None:
230
- """Delete a media object from the temporary store."""
231
- assert obj_path is not None, 'Object path must be provided'
232
- assert obj_path.exists(), f'Object path does not exist: {obj_path}'
233
- assert cls.resolve_url(str(obj_path)) is not None, f'Object path is not a valid media store path: {obj_path}'
234
- obj_path.unlink()
235
-
236
- @classmethod
237
- def create_path(cls, tbl_id: Optional[UUID] = None, extension: str = '') -> Path:
238
- """Return a new, unique Path located in the temporary store.
239
- If tbl_id is provided, the path name will be similar to a MediaStore path based on the tbl_id.
240
- If tbl_id is None, a random UUID will be used to create the path."""
241
- if tbl_id is not None:
242
- return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
243
- return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
244
-
245
- @classmethod
246
- def clear(cls) -> None:
247
- """Clear all files from the temporary store."""
248
- MediaStore(cls._tmp_dir()).clear()
pixeltable/utils/s3.py DELETED
@@ -1,17 +0,0 @@
1
- from typing import Any
2
-
3
-
4
- def get_client(**kwargs: Any) -> Any:
5
- import boto3
6
- import botocore
7
-
8
- try:
9
- boto3.Session().get_credentials().get_frozen_credentials()
10
- config = botocore.config.Config(**kwargs)
11
- return boto3.client('s3', config=config) # credentials are available
12
- except AttributeError:
13
- # No credentials available, use unsigned mode
14
- config_args = kwargs.copy()
15
- config_args['signature_version'] = botocore.UNSIGNED
16
- config = botocore.config.Config(**config_args)
17
- return boto3.client('s3', config=config)