pixeltable 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (39)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +106 -71
  3. pixeltable/catalog/path.py +59 -20
  4. pixeltable/catalog/schema_object.py +1 -0
  5. pixeltable/catalog/table.py +6 -0
  6. pixeltable/catalog/table_version.py +2 -1
  7. pixeltable/catalog/view.py +21 -10
  8. pixeltable/config.py +12 -4
  9. pixeltable/dataframe.py +57 -1
  10. pixeltable/env.py +25 -13
  11. pixeltable/exec/aggregation_node.py +1 -1
  12. pixeltable/exec/cache_prefetch_node.py +2 -6
  13. pixeltable/exec/component_iteration_node.py +4 -3
  14. pixeltable/exec/data_row_batch.py +10 -53
  15. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  16. pixeltable/exec/in_memory_data_node.py +13 -11
  17. pixeltable/exec/sql_node.py +6 -7
  18. pixeltable/exprs/data_row.py +13 -13
  19. pixeltable/exprs/row_builder.py +16 -4
  20. pixeltable/exprs/string_op.py +1 -1
  21. pixeltable/func/expr_template_function.py +1 -4
  22. pixeltable/functions/date.py +1 -1
  23. pixeltable/functions/math.py +1 -1
  24. pixeltable/functions/openai.py +8 -4
  25. pixeltable/functions/timestamp.py +6 -6
  26. pixeltable/globals.py +14 -10
  27. pixeltable/metadata/schema.py +1 -1
  28. pixeltable/plan.py +5 -14
  29. pixeltable/share/packager.py +13 -13
  30. pixeltable/store.py +9 -6
  31. pixeltable/type_system.py +2 -1
  32. pixeltable/utils/filecache.py +1 -1
  33. pixeltable/utils/http_server.py +2 -3
  34. pixeltable/utils/media_store.py +84 -39
  35. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
  36. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/RECORD +39 -39
  37. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
  38. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
  39. {pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
@@ -237,12 +237,12 @@ def _(
237
237
  microsecond: sql.ColumnElement = _SQL_ZERO,
238
238
  ) -> sql.ColumnElement:
239
239
  return sql.func.make_timestamptz(
240
- sql.cast(year, sql.Integer),
241
- sql.cast(month, sql.Integer),
242
- sql.cast(day, sql.Integer),
243
- sql.cast(hour, sql.Integer),
244
- sql.cast(minute, sql.Integer),
245
- sql.cast(second + microsecond / 1000000.0, sql.Float),
240
+ year.cast(sql.Integer),
241
+ month.cast(sql.Integer),
242
+ day.cast(sql.Integer),
243
+ hour.cast(sql.Integer),
244
+ minute.cast(sql.Integer),
245
+ (second + microsecond / 1000000.0).cast(sql.Float),
246
246
  )
247
247
 
248
248
 
pixeltable/globals.py CHANGED
@@ -146,7 +146,7 @@ def create_table(
146
146
  if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
147
147
  raise excs.Error('`schema` must be a non-empty dictionary')
148
148
 
149
- path_obj = catalog.Path(path)
149
+ path_obj = catalog.Path.parse(path)
150
150
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
151
151
  media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
152
152
  primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -284,7 +284,7 @@ def create_view(
284
284
  raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
285
285
  assert isinstance(base, (catalog.Table, DataFrame))
286
286
 
287
- path_obj = catalog.Path(path)
287
+ path_obj = catalog.Path.parse(path)
288
288
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
289
289
  media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
290
290
 
@@ -445,8 +445,12 @@ def get_table(path: str) -> catalog.Table:
445
445
  Handles to views and snapshots are retrieved in the same way:
446
446
 
447
447
  >>> tbl = pxt.get_table('my_snapshot')
448
+
449
+ Get a handle to a specific version of a table:
450
+
451
+ >>> tbl = pxt.get_table('my_table:722')
448
452
  """
449
- path_obj = catalog.Path(path)
453
+ path_obj = catalog.Path.parse(path, allow_versioned_path=True)
450
454
  tbl = Catalog.get().get_table(path_obj)
451
455
  return tbl
452
456
 
@@ -472,7 +476,7 @@ def move(path: str, new_path: str) -> None:
472
476
  """
473
477
  if path == new_path:
474
478
  raise excs.Error('move(): source and destination cannot be identical')
475
- path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
479
+ path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
476
480
  if path_obj.is_ancestor(new_path_obj):
477
481
  raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
478
482
  cat = Catalog.get()
@@ -525,7 +529,7 @@ def drop_table(
525
529
  assert isinstance(table, str)
526
530
  tbl_path = table
527
531
 
528
- path_obj = catalog.Path(tbl_path)
532
+ path_obj = catalog.Path.parse(tbl_path)
529
533
  if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
530
534
  Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
531
535
 
@@ -557,7 +561,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
557
561
 
558
562
 
559
563
  def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
560
- path_obj = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
564
+ path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
561
565
  contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
562
566
  return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
563
567
 
@@ -609,7 +613,7 @@ def create_dir(
609
613
 
610
614
  >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
611
615
  """
612
- path_obj = catalog.Path(path)
616
+ path_obj = catalog.Path.parse(path)
613
617
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
614
618
  return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
615
619
 
@@ -651,7 +655,7 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
651
655
 
652
656
  >>> pxt.drop_dir('my_dir', force=True)
653
657
  """
654
- path_obj = catalog.Path(path) # validate format
658
+ path_obj = catalog.Path.parse(path) # validate format
655
659
  if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
656
660
  Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
657
661
 
@@ -670,7 +674,7 @@ def ls(path: str = '') -> pd.DataFrame:
670
674
  from pixeltable.metadata import schema
671
675
 
672
676
  cat = Catalog.get()
673
- path_obj = catalog.Path(path, empty_is_valid=True)
677
+ path_obj = catalog.Path.parse(path, allow_empty_path=True)
674
678
  dir_entries = cat.get_dir_contents(path_obj)
675
679
 
676
680
  @retry_loop(for_write=False)
@@ -759,7 +763,7 @@ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
759
763
  >>> cl.list_dirs('my_dir', recursive=True)
760
764
  ['my_dir', 'my_dir.sub_dir1']
761
765
  """
762
- path_obj = catalog.Path(path, empty_is_valid=True) # validate format
766
+ path_obj = catalog.Path.parse(path, allow_empty_path=True) # validate format
763
767
  cat = Catalog.get()
764
768
  contents = cat.get_dir_contents(path_obj, recursive=recursive)
765
769
  return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
pixeltable/metadata/schema.py CHANGED
@@ -24,7 +24,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
24
24
  """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
25
25
  if dataclasses.is_dataclass(data_class_type):
26
26
  fieldtypes = get_type_hints(data_class_type)
27
- return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data}) # type: ignore[return-value]
27
+ return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
28
28
 
29
29
  origin = typing.get_origin(data_class_type)
30
30
  if origin is not None:
pixeltable/plan.py CHANGED
@@ -385,14 +385,7 @@ class Planner:
385
385
  TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
386
386
  )
387
387
 
388
- media_input_col_info = [
389
- exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
390
- for col_ref in row_builder.input_exprs
391
- if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
392
- ]
393
- if len(media_input_col_info) > 0:
394
- # prefetch external files for all input column refs
395
- plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
388
+ plan = cls._insert_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
396
389
 
397
390
  computed_exprs = row_builder.output_exprs - row_builder.input_exprs
398
391
  if len(computed_exprs) > 0:
@@ -789,15 +782,13 @@ class Planner:
789
782
 
790
783
  @classmethod
791
784
  def _insert_prefetch_node(
792
- cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input_node: exec.ExecNode
785
+ cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
793
786
  ) -> exec.ExecNode:
794
- """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
787
+ """Return a CachePrefetchNode if needed, otherwise return input"""
795
788
  # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
796
789
  # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
797
790
  # aren't explicitly captured as dependencies
798
- media_col_refs = [
799
- e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
800
- ]
791
+ media_col_refs = [e for e in expressions if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()]
801
792
  if len(media_col_refs) == 0:
802
793
  return input_node
803
794
  # we need to prefetch external files for media column types
@@ -967,7 +958,7 @@ class Planner:
967
958
  stratify_exprs=analyzer.stratify_exprs,
968
959
  )
969
960
 
970
- plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
961
+ plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
971
962
 
972
963
  if analyzer.group_by_clause is not None:
973
964
  # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
pixeltable/share/packager.py CHANGED
@@ -1,7 +1,6 @@
1
1
  import base64
2
2
  import datetime
3
3
  import io
4
- import itertools
5
4
  import json
6
5
  import logging
7
6
  import tarfile
@@ -237,8 +236,7 @@ class TablePackager:
237
236
  - Videos are replaced by their first frame and resized as above
238
237
  - Documents are replaced by a thumbnail as a base64-encoded webp
239
238
  """
240
- # First 8 columns
241
- preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
239
+ preview_cols = self.table._get_schema()
242
240
  select_list = [self.table[col_name] for col_name in preview_cols]
243
241
  # First 5 rows
244
242
  rows = list(self.table.select(*select_list).head(n=5))
@@ -369,7 +367,7 @@ class TableRestorer:
369
367
  with cat.begin_xact(for_write=True):
370
368
  # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
371
369
  # versions that have not been seen before.
372
- cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
370
+ cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
373
371
 
374
372
  # Now we need to load data for replica_tbl and its ancestors, except that we skip
375
373
  # replica_tbl itself if it's a pure snapshot.
@@ -572,16 +570,18 @@ class TableRestorer:
572
570
  for col_name in pydict:
573
571
  assert col_name in tv.store_tbl.sa_tbl.columns
574
572
  sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
575
- media_col_ids: dict[str, int] = {}
573
+ media_cols: dict[str, catalog.Column] = {}
576
574
  for col in tv.cols:
577
575
  if col.is_stored and col.col_type.is_media_type():
578
- media_col_ids[col.store_name()] = col.id
576
+ assert tv.id == col.tbl.id
577
+ assert tv.version == col.tbl.version
578
+ media_cols[col.store_name()] = col
579
579
 
580
580
  row_count = len(next(iter(pydict.values())))
581
581
  rows: list[dict[str, Any]] = []
582
582
  for i in range(row_count):
583
583
  row = {
584
- col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
584
+ col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
585
585
  for col_name, col_vals in pydict.items()
586
586
  }
587
587
  rows.append(row)
@@ -589,19 +589,19 @@ class TableRestorer:
589
589
  return rows
590
590
 
591
591
  def __from_pa_value(
592
- self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
592
+ self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
593
593
  ) -> Any:
594
594
  if val is None:
595
595
  return None
596
596
  if isinstance(sql_type, sql.JSON):
597
597
  return json.loads(val)
598
- if media_col_id is not None:
599
- assert isinstance(val, str)
600
- return self.__relocate_media_file(tv, media_col_id, val)
598
+ if media_col is not None:
599
+ return self.__relocate_media_file(media_col, val)
601
600
  return val
602
601
 
603
- def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
602
+ def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
604
603
  # If this is a pxtmedia:// URL, relocate it
604
+ assert isinstance(url, str)
605
605
  parsed_url = urllib.parse.urlparse(url)
606
606
  assert parsed_url.scheme != 'file' # These should all have been converted to pxtmedia:// URLs
607
607
  if parsed_url.scheme == 'pxtmedia':
@@ -610,7 +610,7 @@ class TableRestorer:
610
610
  # in self.media_files.
611
611
  src_path = self.tmp_dir / 'media' / parsed_url.netloc
612
612
  # Move the file to the media store and update the URL.
613
- self.media_files[url] = MediaStore.relocate_local_media_file(src_path, tv.id, media_col_id, tv.version)
613
+ self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
614
614
  return self.media_files[url]
615
615
  # For any type of URL other than a local file, just return the URL as-is.
616
616
  return url
pixeltable/store.py CHANGED
@@ -123,15 +123,20 @@ class StoreBase:
123
123
  def _storage_name(self) -> str:
124
124
  """Return the name of the data store table"""
125
125
 
126
- def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
127
- return MediaStore.move_tmp_media_file(file_url, self.tbl_version.id, col.id, v_min)
126
+ def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column) -> str:
127
+ src_path = MediaStore.resolve_tmp_url(file_url)
128
+ if src_path is None:
129
+ return file_url
130
+ assert col.tbl.id == self.tbl_version.id # Ensure the column belongs to the same table as this store
131
+ new_file_url = MediaStore.relocate_local_media_file(src_path, col)
132
+ return new_file_url
128
133
 
129
134
  def _move_tmp_media_files(
130
135
  self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
131
136
  ) -> None:
132
137
  """Move tmp media files that we generated to a permanent location"""
133
138
  for n, col in media_cols_by_sql_idx.items():
134
- table_row[n] = self._move_tmp_media_file(table_row[n], col, v_min)
139
+ table_row[n] = self._move_tmp_media_file(table_row[n], col)
135
140
 
136
141
  def count(self) -> int:
137
142
  """Return the number of rows visible in self.tbl_version"""
@@ -259,9 +264,7 @@ class StoreBase:
259
264
  raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
260
265
  table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
261
266
  if col.col_type.is_media_type():
262
- table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
263
- table_row[tmp_val_col_sql_idx], col, row.pk[-1]
264
- )
267
+ table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(table_row[tmp_val_col_sql_idx], col)
265
268
  num_excs += num_row_exc
266
269
  batch_table_rows.append(tuple(table_row))
267
270
 
pixeltable/type_system.py CHANGED
@@ -5,6 +5,7 @@ import datetime
5
5
  import enum
6
6
  import io
7
7
  import json
8
+ import types
8
9
  import typing
9
10
  import urllib.parse
10
11
  import urllib.request
@@ -307,7 +308,7 @@ class ColumnType:
307
308
  """
308
309
  origin = typing.get_origin(t)
309
310
  type_args = typing.get_args(t)
310
- if origin is typing.Union:
311
+ if origin in (typing.Union, types.UnionType):
311
312
  # Check if `t` has the form Optional[T].
312
313
  if len(type_args) == 2 and type(None) in type_args:
313
314
  # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
pixeltable/utils/filecache.py CHANGED
@@ -214,7 +214,7 @@ class FileCache:
214
214
  new_path = entry.path
215
215
  os.rename(str(path), str(new_path))
216
216
  new_path.touch(exist_ok=True)
217
- _logger.debug(f'added entry for cell {url} to file cache')
217
+ _logger.debug(f'FileCache: cached url {url} with file name {new_path}')
218
218
  return new_path
219
219
 
220
220
  def ensure_capacity(self, size: int) -> None:
pixeltable/utils/http_server.py CHANGED
@@ -2,7 +2,7 @@ import http
2
2
  import http.server
3
3
  import logging
4
4
  import pathlib
5
- import urllib
5
+ import urllib.request
6
6
  from typing import Any
7
7
 
8
8
  _logger = logging.getLogger('pixeltable.http.server')
@@ -36,8 +36,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
36
36
  path = path.split('?', 1)[0]
37
37
  path = path.split('#', 1)[0]
38
38
 
39
- path = pathlib.Path(urllib.request.url2pathname(path))
40
- return str(path)
39
+ return str(pathlib.Path(urllib.request.url2pathname(path)))
41
40
 
42
41
  def log_message(self, format: str, *args: Any) -> None:
43
42
  """override logging to stderr in http.server.BaseHTTPRequestHandler"""
pixeltable/utils/media_store.py CHANGED
@@ -1,102 +1,147 @@
1
+ from __future__ import annotations
2
+
1
3
  import glob
2
4
  import os
3
5
  import re
4
6
  import shutil
5
- import urllib
7
+ import urllib.parse
8
+ import urllib.request
6
9
  import uuid
7
10
  from collections import defaultdict
8
11
  from pathlib import Path
9
- from typing import Optional
12
+ from typing import TYPE_CHECKING, Optional
10
13
  from uuid import UUID
11
14
 
12
- from pixeltable.env import Env
15
+ import PIL.Image
16
+
17
+ from pixeltable import env
18
+
19
+ if TYPE_CHECKING:
20
+ from pixeltable.catalog import Column
13
21
 
14
22
 
15
23
  class MediaStore:
16
24
  """
17
25
  Utilities to manage media files stored in Env.media_dir
18
26
 
19
- Media file names are a composite of: table id, column id, version, uuid:
20
- the table id/column id/version are redundant but useful for identifying all files for a table
27
+ Media file names are a composite of: table id, column id, tbl_version, new uuid:
28
+ the table id/column id/tbl_version are redundant but useful for identifying all files for a table
21
29
  or all files created for a particular version of a table
22
30
  """
23
31
 
24
32
  pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)') # tbl_id, col_id, version, uuid
25
33
 
26
34
  @classmethod
27
- def prepare_media_path(cls, tbl_id: UUID, col_id: int, version: int, ext: Optional[str] = None) -> Path:
35
+ def _media_dir(cls) -> Path:
36
+ """Returns the media directory path."""
37
+ return env.Env.get().media_dir
38
+
39
+ @classmethod
40
+ def _tmp_dir(cls) -> Path:
41
+ """Returns the temporary directory path."""
42
+ return env.Env.get().tmp_dir
43
+
44
+ @classmethod
45
+ def _prepare_media_path(cls, col: Column, ext: Optional[str] = None) -> Path:
28
46
  """
29
47
  Construct a new, unique Path name for a persisted media file, and create the parent directory
30
48
  for the new Path if it does not already exist. The Path will reside in
31
49
  the environment's media_dir.
32
50
  """
33
51
  id_hex = uuid.uuid4().hex
34
- parent = Env.get().media_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
52
+ parent = cls._media_dir() / col.tbl.id.hex / id_hex[:2] / id_hex[:4]
35
53
  parent.mkdir(parents=True, exist_ok=True)
36
- return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'
54
+ return parent / f'{col.tbl.id.hex}_{col.id}_{col.tbl.version}_{id_hex}{ext or ""}'
37
55
 
38
56
  @classmethod
39
- def move_tmp_media_file(cls, file_url: Optional[str], tbl_id: UUID, col_id: int, v_min: int) -> Optional[str]:
40
- """Move a tmp media file with given url into the MediaStore, and return new url
41
- If it is not a tmp file in the tmp_dir, return the original url.
57
+ def resolve_tmp_url(cls, file_url: Optional[str]) -> Optional[Path]:
58
+ """Return path if the given url is a tmp file.
42
59
 
43
60
  Args:
44
- file_url: URL of the tmp media file to move
45
- tbl_id: Table ID to associate with the media file
46
- col_id: Column ID to associate with the media file
47
- v_min: Version number to associate with the media file
61
+ file_url: URL of the tmp media file to check
48
62
 
49
63
  Returns:
50
- URL of the media final location of the file
64
+ If the file_url is a tmp file, return a Path() to the tmp file, None, otherwise
51
65
  """
52
66
  if file_url is None:
53
67
  return None
54
68
  assert isinstance(file_url, str), type(file_url)
55
- pxt_tmp_dir = str(Env.get().tmp_dir)
56
69
  parsed = urllib.parse.urlparse(file_url)
57
70
  # We should never be passed a local file path here. The "len > 1" ensures that Windows
58
71
  # file paths aren't mistaken for URLs with a single-character scheme.
59
72
  assert len(parsed.scheme) > 1, file_url
60
73
  if parsed.scheme != 'file':
61
74
  # remote url
62
- return file_url
63
- file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
64
- if not file_path.startswith(pxt_tmp_dir):
75
+ return None
76
+ src_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
77
+ pxt_tmp_dir = str(cls._tmp_dir())
78
+ if not src_path.startswith(pxt_tmp_dir):
65
79
  # not a tmp file
66
- return file_url
67
- new_file_url = cls.relocate_local_media_file(Path(file_path), tbl_id, col_id, v_min)
68
- return new_file_url
80
+ return None
81
+ return Path(src_path)
69
82
 
70
83
  @classmethod
71
- def relocate_local_media_file(cls, src_path: Path, tbl_id: UUID, col_id: int, tbl_version: int) -> str:
72
- dest_path = MediaStore.prepare_media_path(tbl_id, col_id, tbl_version, ext=src_path.suffix)
84
+ def relocate_local_media_file(cls, src_path: Path, col: Column) -> str:
85
+ """Relocate a local file to the MediaStore, and return its new URL"""
86
+ dest_path = cls._prepare_media_path(col, ext=src_path.suffix)
73
87
  src_path.rename(dest_path)
74
88
  return urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
75
89
 
76
90
  @classmethod
77
- def save_media_file(cls, file_data: bytes, tbl_id: UUID, col_id: int, tbl_version: int) -> Path:
78
- """Save a media binary data to a file in the MediaStore."""
91
+ def save_media_object(cls, data: bytes | PIL.Image.Image, col: Column, format: Optional[str]) -> tuple[Path, str]:
92
+ """Save a media data to a file in the MediaStore
93
+ Returns:
94
+ dest_path: Path to the saved media file
95
+ url: URL of the saved media file
96
+ """
97
+ assert col.col_type.is_media_type(), f'MediaStore: request to store non media_type Column {col.name}'
98
+ dest_path = cls._prepare_media_path(col)
99
+ if isinstance(data, bytes):
100
+ dest_path = cls._save_binary_media_file(data, dest_path, format)
101
+ elif isinstance(data, PIL.Image.Image):
102
+ dest_path = cls._save_pil_image_file(data, dest_path, format)
103
+ else:
104
+ raise ValueError(f'Unsupported media object type: {type(data)}')
105
+ url = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
106
+ return dest_path, url
107
+
108
+ @classmethod
109
+ def _save_binary_media_file(cls, file_data: bytes, dest_path: Path, format: Optional[str]) -> Path:
110
+ """Save a media binary data to a file in the MediaStore. format is ignored for binary data."""
79
111
  assert isinstance(file_data, bytes)
80
- media_path = cls.prepare_media_path(tbl_id, col_id, tbl_version)
81
- with open(media_path, 'wb') as f:
112
+ with open(dest_path, 'wb') as f:
82
113
  f.write(file_data)
83
114
  f.flush() # Ensures Python buffers are written to OS
84
115
  os.fsync(f.fileno()) # Forces OS to write to physical storage
85
- return media_path
116
+ return dest_path
117
+
118
+ @classmethod
119
+ def _save_pil_image_file(cls, image: PIL.Image.Image, dest_path: Path, format: Optional[str]) -> Path:
120
+ """Save a PIL Image to a file in the MediaStore with the specified format."""
121
+ if dest_path.suffix != f'.{format}':
122
+ dest_path = dest_path.with_name(f'{dest_path.name}.{format}')
123
+
124
+ with open(dest_path, 'wb') as f:
125
+ image.save(f, format=format)
126
+ f.flush() # Ensures Python buffers are written to OS
127
+ os.fsync(f.fileno()) # Forces OS to write to physical storage
128
+ return dest_path
86
129
 
87
130
  @classmethod
88
- def delete(cls, tbl_id: UUID, version: Optional[int] = None) -> None:
89
- """Delete all files belonging to tbl_id. If version is not None, delete
90
- only those files belonging to the specified version."""
131
+ def delete(cls, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
132
+ """Delete all files belonging to tbl_id. If tbl_version is not None, delete
133
+ only those files belonging to the specified tbl_version."""
91
134
  assert tbl_id is not None
92
- if version is None:
135
+ if tbl_version is None:
93
136
  # Remove the entire folder for this table id.
94
- path = Env.get().media_dir / tbl_id.hex
137
+ path = cls._media_dir() / tbl_id.hex
95
138
  if path.exists():
96
139
  shutil.rmtree(path)
97
140
  else:
98
- # Remove only the elements for the specified version.
99
- paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{version}_*', recursive=True)
141
+ # Remove only the elements for the specified tbl_version.
142
+ paths = glob.glob(
143
+ str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{tbl_version}_*', recursive=True
144
+ )
100
145
  for p in paths:
101
146
  os.remove(p)
102
147
 
@@ -105,12 +150,12 @@ class MediaStore:
105
150
  """
106
151
  Return number of files for given tbl_id.
107
152
  """
108
- paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
153
+ paths = glob.glob(str(cls._media_dir() / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
109
154
  return len(paths)
110
155
 
111
156
  @classmethod
112
157
  def stats(cls) -> list[tuple[UUID, int, int, int]]:
113
- paths = glob.glob(str(Env.get().media_dir) + '/**', recursive=True)
158
+ paths = glob.glob(str(cls._media_dir()) + '/**', recursive=True)
114
159
  # key: (tbl_id, col_id), value: (num_files, size)
115
160
  d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
116
161
  for p in paths:
{pixeltable-0.4.4.dist-info → pixeltable-0.4.5.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: pixeltable
3
- Version: 0.4.4
3
+ Version: 0.4.5
4
4
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
5
5
  License: Apache-2.0
6
6
  Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai