pixeltable 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +106 -71
- pixeltable/catalog/path.py +59 -20
- pixeltable/catalog/schema_object.py +1 -0
- pixeltable/catalog/table.py +6 -0
- pixeltable/catalog/table_version.py +2 -1
- pixeltable/catalog/view.py +21 -10
- pixeltable/config.py +12 -4
- pixeltable/dataframe.py +57 -1
- pixeltable/env.py +25 -13
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +2 -6
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +10 -53
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/in_memory_data_node.py +13 -11
- pixeltable/exec/sql_node.py +6 -7
- pixeltable/exprs/data_row.py +13 -13
- pixeltable/exprs/row_builder.py +16 -4
- pixeltable/exprs/string_op.py +1 -1
- pixeltable/func/expr_template_function.py +1 -4
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/openai.py +8 -4
- pixeltable/functions/timestamp.py +6 -6
- pixeltable/globals.py +14 -10
- pixeltable/metadata/schema.py +1 -1
- pixeltable/plan.py +5 -14
- pixeltable/share/packager.py +13 -13
- pixeltable/store.py +9 -6
- pixeltable/type_system.py +2 -1
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/media_store.py +84 -39
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/RECORD +39 -39
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
|
@@ -237,12 +237,12 @@ def _(
|
|
|
237
237
|
microsecond: sql.ColumnElement = _SQL_ZERO,
|
|
238
238
|
) -> sql.ColumnElement:
|
|
239
239
|
return sql.func.make_timestamptz(
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
240
|
+
year.cast(sql.Integer),
|
|
241
|
+
month.cast(sql.Integer),
|
|
242
|
+
day.cast(sql.Integer),
|
|
243
|
+
hour.cast(sql.Integer),
|
|
244
|
+
minute.cast(sql.Integer),
|
|
245
|
+
(second + microsecond / 1000000.0).cast(sql.Float),
|
|
246
246
|
)
|
|
247
247
|
|
|
248
248
|
|
pixeltable/globals.py
CHANGED
|
@@ -146,7 +146,7 @@ def create_table(
|
|
|
146
146
|
if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
|
|
147
147
|
raise excs.Error('`schema` must be a non-empty dictionary')
|
|
148
148
|
|
|
149
|
-
path_obj = catalog.Path(path)
|
|
149
|
+
path_obj = catalog.Path.parse(path)
|
|
150
150
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
151
151
|
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
152
152
|
primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
|
|
@@ -284,7 +284,7 @@ def create_view(
|
|
|
284
284
|
raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
|
|
285
285
|
assert isinstance(base, (catalog.Table, DataFrame))
|
|
286
286
|
|
|
287
|
-
path_obj = catalog.Path(path)
|
|
287
|
+
path_obj = catalog.Path.parse(path)
|
|
288
288
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
289
289
|
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
290
290
|
|
|
@@ -445,8 +445,12 @@ def get_table(path: str) -> catalog.Table:
|
|
|
445
445
|
Handles to views and snapshots are retrieved in the same way:
|
|
446
446
|
|
|
447
447
|
>>> tbl = pxt.get_table('my_snapshot')
|
|
448
|
+
|
|
449
|
+
Get a handle to a specific version of a table:
|
|
450
|
+
|
|
451
|
+
>>> tbl = pxt.get_table('my_table:722')
|
|
448
452
|
"""
|
|
449
|
-
path_obj = catalog.Path(path)
|
|
453
|
+
path_obj = catalog.Path.parse(path, allow_versioned_path=True)
|
|
450
454
|
tbl = Catalog.get().get_table(path_obj)
|
|
451
455
|
return tbl
|
|
452
456
|
|
|
@@ -472,7 +476,7 @@ def move(path: str, new_path: str) -> None:
|
|
|
472
476
|
"""
|
|
473
477
|
if path == new_path:
|
|
474
478
|
raise excs.Error('move(): source and destination cannot be identical')
|
|
475
|
-
path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
|
|
479
|
+
path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
|
|
476
480
|
if path_obj.is_ancestor(new_path_obj):
|
|
477
481
|
raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
|
|
478
482
|
cat = Catalog.get()
|
|
@@ -525,7 +529,7 @@ def drop_table(
|
|
|
525
529
|
assert isinstance(table, str)
|
|
526
530
|
tbl_path = table
|
|
527
531
|
|
|
528
|
-
path_obj = catalog.Path(tbl_path)
|
|
532
|
+
path_obj = catalog.Path.parse(tbl_path)
|
|
529
533
|
if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
530
534
|
Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
|
|
531
535
|
|
|
@@ -557,7 +561,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
557
561
|
|
|
558
562
|
|
|
559
563
|
def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
|
|
560
|
-
path_obj = catalog.Path(dir_path,
|
|
564
|
+
path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
|
|
561
565
|
contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
|
|
562
566
|
return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
|
|
563
567
|
|
|
@@ -609,7 +613,7 @@ def create_dir(
|
|
|
609
613
|
|
|
610
614
|
>>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
|
|
611
615
|
"""
|
|
612
|
-
path_obj = catalog.Path(path)
|
|
616
|
+
path_obj = catalog.Path.parse(path)
|
|
613
617
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
614
618
|
return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
|
|
615
619
|
|
|
@@ -651,7 +655,7 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
|
|
|
651
655
|
|
|
652
656
|
>>> pxt.drop_dir('my_dir', force=True)
|
|
653
657
|
"""
|
|
654
|
-
path_obj = catalog.Path(path) # validate format
|
|
658
|
+
path_obj = catalog.Path.parse(path) # validate format
|
|
655
659
|
if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
656
660
|
Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
|
|
657
661
|
|
|
@@ -670,7 +674,7 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
670
674
|
from pixeltable.metadata import schema
|
|
671
675
|
|
|
672
676
|
cat = Catalog.get()
|
|
673
|
-
path_obj = catalog.Path(path,
|
|
677
|
+
path_obj = catalog.Path.parse(path, allow_empty_path=True)
|
|
674
678
|
dir_entries = cat.get_dir_contents(path_obj)
|
|
675
679
|
|
|
676
680
|
@retry_loop(for_write=False)
|
|
@@ -759,7 +763,7 @@ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
|
|
|
759
763
|
>>> cl.list_dirs('my_dir', recursive=True)
|
|
760
764
|
['my_dir', 'my_dir.sub_dir1']
|
|
761
765
|
"""
|
|
762
|
-
path_obj = catalog.Path(path,
|
|
766
|
+
path_obj = catalog.Path.parse(path, allow_empty_path=True) # validate format
|
|
763
767
|
cat = Catalog.get()
|
|
764
768
|
contents = cat.get_dir_contents(path_obj, recursive=recursive)
|
|
765
769
|
return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -24,7 +24,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
|
|
|
24
24
|
"""Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
|
|
25
25
|
if dataclasses.is_dataclass(data_class_type):
|
|
26
26
|
fieldtypes = get_type_hints(data_class_type)
|
|
27
|
-
return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
|
|
27
|
+
return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
|
|
28
28
|
|
|
29
29
|
origin = typing.get_origin(data_class_type)
|
|
30
30
|
if origin is not None:
|
pixeltable/plan.py
CHANGED
|
@@ -385,14 +385,7 @@ class Planner:
|
|
|
385
385
|
TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
|
|
386
386
|
)
|
|
387
387
|
|
|
388
|
-
|
|
389
|
-
exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
|
|
390
|
-
for col_ref in row_builder.input_exprs
|
|
391
|
-
if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
|
|
392
|
-
]
|
|
393
|
-
if len(media_input_col_info) > 0:
|
|
394
|
-
# prefetch external files for all input column refs
|
|
395
|
-
plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
|
|
388
|
+
plan = cls._insert_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
|
|
396
389
|
|
|
397
390
|
computed_exprs = row_builder.output_exprs - row_builder.input_exprs
|
|
398
391
|
if len(computed_exprs) > 0:
|
|
@@ -789,15 +782,13 @@ class Planner:
|
|
|
789
782
|
|
|
790
783
|
@classmethod
|
|
791
784
|
def _insert_prefetch_node(
|
|
792
|
-
cls, tbl_id: UUID,
|
|
785
|
+
cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
|
|
793
786
|
) -> exec.ExecNode:
|
|
794
|
-
"""
|
|
787
|
+
"""Return a CachePrefetchNode if needed, otherwise return input"""
|
|
795
788
|
# we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
|
|
796
789
|
# of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
|
|
797
790
|
# aren't explicitly captured as dependencies
|
|
798
|
-
media_col_refs = [
|
|
799
|
-
e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
|
|
800
|
-
]
|
|
791
|
+
media_col_refs = [e for e in expressions if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()]
|
|
801
792
|
if len(media_col_refs) == 0:
|
|
802
793
|
return input_node
|
|
803
794
|
# we need to prefetch external files for media column types
|
|
@@ -967,7 +958,7 @@ class Planner:
|
|
|
967
958
|
stratify_exprs=analyzer.stratify_exprs,
|
|
968
959
|
)
|
|
969
960
|
|
|
970
|
-
plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
|
|
961
|
+
plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
|
|
971
962
|
|
|
972
963
|
if analyzer.group_by_clause is not None:
|
|
973
964
|
# we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
|
pixeltable/share/packager.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import base64
|
|
2
2
|
import datetime
|
|
3
3
|
import io
|
|
4
|
-
import itertools
|
|
5
4
|
import json
|
|
6
5
|
import logging
|
|
7
6
|
import tarfile
|
|
@@ -237,8 +236,7 @@ class TablePackager:
|
|
|
237
236
|
- Videos are replaced by their first frame and resized as above
|
|
238
237
|
- Documents are replaced by a thumbnail as a base64-encoded webp
|
|
239
238
|
"""
|
|
240
|
-
|
|
241
|
-
preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
|
|
239
|
+
preview_cols = self.table._get_schema()
|
|
242
240
|
select_list = [self.table[col_name] for col_name in preview_cols]
|
|
243
241
|
# First 5 rows
|
|
244
242
|
rows = list(self.table.select(*select_list).head(n=5))
|
|
@@ -369,7 +367,7 @@ class TableRestorer:
|
|
|
369
367
|
with cat.begin_xact(for_write=True):
|
|
370
368
|
# Create (or update) the replica table and its ancestors, along with TableVersion instances for any
|
|
371
369
|
# versions that have not been seen before.
|
|
372
|
-
cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
|
|
370
|
+
cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
|
|
373
371
|
|
|
374
372
|
# Now we need to load data for replica_tbl and its ancestors, except that we skip
|
|
375
373
|
# replica_tbl itself if it's a pure snapshot.
|
|
@@ -572,16 +570,18 @@ class TableRestorer:
|
|
|
572
570
|
for col_name in pydict:
|
|
573
571
|
assert col_name in tv.store_tbl.sa_tbl.columns
|
|
574
572
|
sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
|
|
575
|
-
|
|
573
|
+
media_cols: dict[str, catalog.Column] = {}
|
|
576
574
|
for col in tv.cols:
|
|
577
575
|
if col.is_stored and col.col_type.is_media_type():
|
|
578
|
-
|
|
576
|
+
assert tv.id == col.tbl.id
|
|
577
|
+
assert tv.version == col.tbl.version
|
|
578
|
+
media_cols[col.store_name()] = col
|
|
579
579
|
|
|
580
580
|
row_count = len(next(iter(pydict.values())))
|
|
581
581
|
rows: list[dict[str, Any]] = []
|
|
582
582
|
for i in range(row_count):
|
|
583
583
|
row = {
|
|
584
|
-
col_name: self.__from_pa_value(
|
|
584
|
+
col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
|
|
585
585
|
for col_name, col_vals in pydict.items()
|
|
586
586
|
}
|
|
587
587
|
rows.append(row)
|
|
@@ -589,19 +589,19 @@ class TableRestorer:
|
|
|
589
589
|
return rows
|
|
590
590
|
|
|
591
591
|
def __from_pa_value(
|
|
592
|
-
self,
|
|
592
|
+
self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
|
|
593
593
|
) -> Any:
|
|
594
594
|
if val is None:
|
|
595
595
|
return None
|
|
596
596
|
if isinstance(sql_type, sql.JSON):
|
|
597
597
|
return json.loads(val)
|
|
598
|
-
if
|
|
599
|
-
|
|
600
|
-
return self.__relocate_media_file(tv, media_col_id, val)
|
|
598
|
+
if media_col is not None:
|
|
599
|
+
return self.__relocate_media_file(media_col, val)
|
|
601
600
|
return val
|
|
602
601
|
|
|
603
|
-
def __relocate_media_file(self,
|
|
602
|
+
def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
|
|
604
603
|
# If this is a pxtmedia:// URL, relocate it
|
|
604
|
+
assert isinstance(url, str)
|
|
605
605
|
parsed_url = urllib.parse.urlparse(url)
|
|
606
606
|
assert parsed_url.scheme != 'file' # These should all have been converted to pxtmedia:// URLs
|
|
607
607
|
if parsed_url.scheme == 'pxtmedia':
|
|
@@ -610,7 +610,7 @@ class TableRestorer:
|
|
|
610
610
|
# in self.media_files.
|
|
611
611
|
src_path = self.tmp_dir / 'media' / parsed_url.netloc
|
|
612
612
|
# Move the file to the media store and update the URL.
|
|
613
|
-
self.media_files[url] = MediaStore.relocate_local_media_file(src_path,
|
|
613
|
+
self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
|
|
614
614
|
return self.media_files[url]
|
|
615
615
|
# For any type of URL other than a local file, just return the URL as-is.
|
|
616
616
|
return url
|
pixeltable/store.py
CHANGED
|
@@ -123,15 +123,20 @@ class StoreBase:
|
|
|
123
123
|
def _storage_name(self) -> str:
|
|
124
124
|
"""Return the name of the data store table"""
|
|
125
125
|
|
|
126
|
-
def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column
|
|
127
|
-
|
|
126
|
+
def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column) -> str:
|
|
127
|
+
src_path = MediaStore.resolve_tmp_url(file_url)
|
|
128
|
+
if src_path is None:
|
|
129
|
+
return file_url
|
|
130
|
+
assert col.tbl.id == self.tbl_version.id # Ensure the column belongs to the same table as this store
|
|
131
|
+
new_file_url = MediaStore.relocate_local_media_file(src_path, col)
|
|
132
|
+
return new_file_url
|
|
128
133
|
|
|
129
134
|
def _move_tmp_media_files(
|
|
130
135
|
self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
|
|
131
136
|
) -> None:
|
|
132
137
|
"""Move tmp media files that we generated to a permanent location"""
|
|
133
138
|
for n, col in media_cols_by_sql_idx.items():
|
|
134
|
-
table_row[n] = self._move_tmp_media_file(table_row[n], col
|
|
139
|
+
table_row[n] = self._move_tmp_media_file(table_row[n], col)
|
|
135
140
|
|
|
136
141
|
def count(self) -> int:
|
|
137
142
|
"""Return the number of rows visible in self.tbl_version"""
|
|
@@ -259,9 +264,7 @@ class StoreBase:
|
|
|
259
264
|
raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
|
|
260
265
|
table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
|
|
261
266
|
if col.col_type.is_media_type():
|
|
262
|
-
table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
|
|
263
|
-
table_row[tmp_val_col_sql_idx], col, row.pk[-1]
|
|
264
|
-
)
|
|
267
|
+
table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(table_row[tmp_val_col_sql_idx], col)
|
|
265
268
|
num_excs += num_row_exc
|
|
266
269
|
batch_table_rows.append(tuple(table_row))
|
|
267
270
|
|
pixeltable/type_system.py
CHANGED
|
@@ -5,6 +5,7 @@ import datetime
|
|
|
5
5
|
import enum
|
|
6
6
|
import io
|
|
7
7
|
import json
|
|
8
|
+
import types
|
|
8
9
|
import typing
|
|
9
10
|
import urllib.parse
|
|
10
11
|
import urllib.request
|
|
@@ -307,7 +308,7 @@ class ColumnType:
|
|
|
307
308
|
"""
|
|
308
309
|
origin = typing.get_origin(t)
|
|
309
310
|
type_args = typing.get_args(t)
|
|
310
|
-
if origin
|
|
311
|
+
if origin in (typing.Union, types.UnionType):
|
|
311
312
|
# Check if `t` has the form Optional[T].
|
|
312
313
|
if len(type_args) == 2 and type(None) in type_args:
|
|
313
314
|
# `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
|
pixeltable/utils/filecache.py
CHANGED
|
@@ -214,7 +214,7 @@ class FileCache:
|
|
|
214
214
|
new_path = entry.path
|
|
215
215
|
os.rename(str(path), str(new_path))
|
|
216
216
|
new_path.touch(exist_ok=True)
|
|
217
|
-
_logger.debug(f'
|
|
217
|
+
_logger.debug(f'FileCache: cached url {url} with file name {new_path}')
|
|
218
218
|
return new_path
|
|
219
219
|
|
|
220
220
|
def ensure_capacity(self, size: int) -> None:
|
pixeltable/utils/http_server.py
CHANGED
|
@@ -2,7 +2,7 @@ import http
|
|
|
2
2
|
import http.server
|
|
3
3
|
import logging
|
|
4
4
|
import pathlib
|
|
5
|
-
import urllib
|
|
5
|
+
import urllib.request
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
8
|
_logger = logging.getLogger('pixeltable.http.server')
|
|
@@ -36,8 +36,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
36
36
|
path = path.split('?', 1)[0]
|
|
37
37
|
path = path.split('#', 1)[0]
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
return str(path)
|
|
39
|
+
return str(pathlib.Path(urllib.request.url2pathname(path)))
|
|
41
40
|
|
|
42
41
|
def log_message(self, format: str, *args: Any) -> None:
|
|
43
42
|
"""override logging to stderr in http.server.BaseHTTPRequestHandler"""
|
pixeltable/utils/media_store.py
CHANGED
|
@@ -1,102 +1,147 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import glob
|
|
2
4
|
import os
|
|
3
5
|
import re
|
|
4
6
|
import shutil
|
|
5
|
-
import urllib
|
|
7
|
+
import urllib.parse
|
|
8
|
+
import urllib.request
|
|
6
9
|
import uuid
|
|
7
10
|
from collections import defaultdict
|
|
8
11
|
from pathlib import Path
|
|
9
|
-
from typing import Optional
|
|
12
|
+
from typing import TYPE_CHECKING, Optional
|
|
10
13
|
from uuid import UUID
|
|
11
14
|
|
|
12
|
-
|
|
15
|
+
import PIL.Image
|
|
16
|
+
|
|
17
|
+
from pixeltable import env
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from pixeltable.catalog import Column
|
|
13
21
|
|
|
14
22
|
|
|
15
23
|
class MediaStore:
|
|
16
24
|
"""
|
|
17
25
|
Utilities to manage media files stored in Env.media_dir
|
|
18
26
|
|
|
19
|
-
Media file names are a composite of: table id, column id,
|
|
20
|
-
the table id/column id/
|
|
27
|
+
Media file names are a composite of: table id, column id, tbl_version, new uuid:
|
|
28
|
+
the table id/column id/tbl_version are redundant but useful for identifying all files for a table
|
|
21
29
|
or all files created for a particular version of a table
|
|
22
30
|
"""
|
|
23
31
|
|
|
24
32
|
pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)') # tbl_id, col_id, version, uuid
|
|
25
33
|
|
|
26
34
|
@classmethod
|
|
27
|
-
def
|
|
35
|
+
def _media_dir(cls) -> Path:
|
|
36
|
+
"""Returns the media directory path."""
|
|
37
|
+
return env.Env.get().media_dir
|
|
38
|
+
|
|
39
|
+
@classmethod
|
|
40
|
+
def _tmp_dir(cls) -> Path:
|
|
41
|
+
"""Returns the temporary directory path."""
|
|
42
|
+
return env.Env.get().tmp_dir
|
|
43
|
+
|
|
44
|
+
@classmethod
|
|
45
|
+
def _prepare_media_path(cls, col: Column, ext: Optional[str] = None) -> Path:
|
|
28
46
|
"""
|
|
29
47
|
Construct a new, unique Path name for a persisted media file, and create the parent directory
|
|
30
48
|
for the new Path if it does not already exist. The Path will reside in
|
|
31
49
|
the environment's media_dir.
|
|
32
50
|
"""
|
|
33
51
|
id_hex = uuid.uuid4().hex
|
|
34
|
-
parent =
|
|
52
|
+
parent = cls._media_dir() / col.tbl.id.hex / id_hex[:2] / id_hex[:4]
|
|
35
53
|
parent.mkdir(parents=True, exist_ok=True)
|
|
36
|
-
return parent / f'{
|
|
54
|
+
return parent / f'{col.tbl.id.hex}_{col.id}_{col.tbl.version}_{id_hex}{ext or ""}'
|
|
37
55
|
|
|
38
56
|
@classmethod
|
|
39
|
-
def
|
|
40
|
-
"""
|
|
41
|
-
If it is not a tmp file in the tmp_dir, return the original url.
|
|
57
|
+
def resolve_tmp_url(cls, file_url: Optional[str]) -> Optional[Path]:
|
|
58
|
+
"""Return path if the given url is a tmp file.
|
|
42
59
|
|
|
43
60
|
Args:
|
|
44
|
-
file_url: URL of the tmp media file to
|
|
45
|
-
tbl_id: Table ID to associate with the media file
|
|
46
|
-
col_id: Column ID to associate with the media file
|
|
47
|
-
v_min: Version number to associate with the media file
|
|
61
|
+
file_url: URL of the tmp media file to check
|
|
48
62
|
|
|
49
63
|
Returns:
|
|
50
|
-
|
|
64
|
+
If the file_url is a tmp file, return a Path() to the tmp file, None, otherwise
|
|
51
65
|
"""
|
|
52
66
|
if file_url is None:
|
|
53
67
|
return None
|
|
54
68
|
assert isinstance(file_url, str), type(file_url)
|
|
55
|
-
pxt_tmp_dir = str(Env.get().tmp_dir)
|
|
56
69
|
parsed = urllib.parse.urlparse(file_url)
|
|
57
70
|
# We should never be passed a local file path here. The "len > 1" ensures that Windows
|
|
58
71
|
# file paths aren't mistaken for URLs with a single-character scheme.
|
|
59
72
|
assert len(parsed.scheme) > 1, file_url
|
|
60
73
|
if parsed.scheme != 'file':
|
|
61
74
|
# remote url
|
|
62
|
-
return
|
|
63
|
-
|
|
64
|
-
|
|
75
|
+
return None
|
|
76
|
+
src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
|
|
77
|
+
pxt_tmp_dir = str(cls._tmp_dir())
|
|
78
|
+
if not src_path.startswith(pxt_tmp_dir):
|
|
65
79
|
# not a tmp file
|
|
66
|
-
return
|
|
67
|
-
|
|
68
|
-
return new_file_url
|
|
80
|
+
return None
|
|
81
|
+
return Path(src_path)
|
|
69
82
|
|
|
70
83
|
@classmethod
|
|
71
|
-
def relocate_local_media_file(cls, src_path: Path,
|
|
72
|
-
|
|
84
|
+
def relocate_local_media_file(cls, src_path: Path, col: Column) -> str:
|
|
85
|
+
"""Relocate a local file to the MediaStore, and return its new URL"""
|
|
86
|
+
dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
|
|
73
87
|
src_path.rename(dest_path)
|
|
74
88
|
return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
75
89
|
|
|
76
90
|
@classmethod
|
|
77
|
-
def
|
|
78
|
-
"""Save a media
|
|
91
|
+
def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
|
|
92
|
+
"""Save a media data to a file in the MediaStore
|
|
93
|
+
Returns:
|
|
94
|
+
dest_path: Path to the saved media file
|
|
95
|
+
url: URL of the saved media file
|
|
96
|
+
"""
|
|
97
|
+
assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
|
|
98
|
+
dest_path = cls._prepare_media_path(col)
|
|
99
|
+
if isinstance(data, bytes):
|
|
100
|
+
dest_path = cls._save_binary_media_file(data, dest_path, format)
|
|
101
|
+
elif isinstance(data, PIL.Image.Image):
|
|
102
|
+
dest_path = cls._save_pil_image_file(data, dest_path, format)
|
|
103
|
+
else:
|
|
104
|
+
raise ValueError(f'Unsupported media object type: {type(data)}')
|
|
105
|
+
url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
|
|
106
|
+
return dest_path, url
|
|
107
|
+
|
|
108
|
+
@classmethod
|
|
109
|
+
def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
|
|
110
|
+
"""Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
|
|
79
111
|
assert isinstance(file_data, bytes)
|
|
80
|
-
|
|
81
|
-
with open(media_path, 'wb') as f:
|
|
112
|
+
with open(dest_path, 'wb') as f:
|
|
82
113
|
f.write(file_data)
|
|
83
114
|
f.flush() # Ensures Python buffers are written to OS
|
|
84
115
|
os.fsync(f.fileno()) # Forces OS to write to physical storage
|
|
85
|
-
return
|
|
116
|
+
return dest_path
|
|
117
|
+
|
|
118
|
+
@classmethod
|
|
119
|
+
def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
|
|
120
|
+
"""Save a PIL Image to a file in the MediaStore with the specified format."""
|
|
121
|
+
if dest_path.suffix != f'.{format}':
|
|
122
|
+
dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
|
|
123
|
+
|
|
124
|
+
with open(dest_path, 'wb') as f:
|
|
125
|
+
image.save(f, format=format)
|
|
126
|
+
f.flush() # Ensures Python buffers are written to OS
|
|
127
|
+
os.fsync(f.fileno()) # Forces OS to write to physical storage
|
|
128
|
+
return dest_path
|
|
86
129
|
|
|
87
130
|
@classmethod
|
|
88
|
-
def delete(cls, tbl_id: UUID,
|
|
89
|
-
"""Delete all files belonging to tbl_id. If
|
|
90
|
-
only those files belonging to the specified
|
|
131
|
+
def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
|
|
132
|
+
"""Delete all files belonging to tbl_id. If tbl_version is not None, delete
|
|
133
|
+
only those files belonging to the specified tbl_version."""
|
|
91
134
|
assert tbl_id is not None
|
|
92
|
-
if
|
|
135
|
+
if tbl_version is None:
|
|
93
136
|
# Remove the entire folder for this table id.
|
|
94
|
-
path =
|
|
137
|
+
path = cls._media_dir() / tbl_id.hex
|
|
95
138
|
if path.exists():
|
|
96
139
|
shutil.rmtree(path)
|
|
97
140
|
else:
|
|
98
|
-
# Remove only the elements for the specified
|
|
99
|
-
paths = glob.glob(
|
|
141
|
+
# Remove only the elements for the specified tbl_version.
|
|
142
|
+
paths = glob.glob(
|
|
143
|
+
str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
|
|
144
|
+
)
|
|
100
145
|
for p in paths:
|
|
101
146
|
os.remove(p)
|
|
102
147
|
|
|
@@ -105,12 +150,12 @@ class MediaStore:
|
|
|
105
150
|
"""
|
|
106
151
|
Return number of files for given tbl_id.
|
|
107
152
|
"""
|
|
108
|
-
paths = glob.glob(str(
|
|
153
|
+
paths = glob.glob(str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
|
|
109
154
|
return len(paths)
|
|
110
155
|
|
|
111
156
|
@classmethod
|
|
112
157
|
def stats(cls) -> list[tuple[UUID, int, int, int]]:
|
|
113
|
-
paths = glob.glob(str(
|
|
158
|
+
paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
|
|
114
159
|
# key: (tbl_id, col_id), value: (num_files, size)
|
|
115
160
|
d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
|
|
116
161
|
for p in paths:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: pixeltable
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.5
|
|
4
4
|
Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
|