pixeltable 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = "0.2.19"
3
- __version_tuple__ = (0, 2, 19)
2
+ __version__ = "0.2.20"
3
+ __version_tuple__ = (0, 2, 20)
@@ -10,6 +10,7 @@ import pixeltable as pxt
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import exceptions as excs
12
12
  from pixeltable.env import Env
13
+ from pixeltable.utils.filecache import FileCache
13
14
 
14
15
  from .catalog import Catalog
15
16
  from .globals import UpdateStatus
@@ -101,21 +102,22 @@ class InsertableTable(Table):
101
102
  if not isinstance(row, dict):
102
103
  raise excs.Error('rows must be a list of dictionaries')
103
104
  self._validate_input_rows(rows)
104
- result = self._tbl_version.insert(rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception)
105
+ status = self._tbl_version.insert(rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception)
105
106
 
106
- if result.num_excs == 0:
107
+ if status.num_excs == 0:
107
108
  cols_with_excs_str = ''
108
109
  else:
109
110
  cols_with_excs_str = \
110
- f' across {len(result.cols_with_excs)} column{"" if len(result.cols_with_excs) == 1 else "s"}'
111
- cols_with_excs_str += f' ({", ".join(result.cols_with_excs)})'
111
+ f' across {len(status.cols_with_excs)} column{"" if len(status.cols_with_excs) == 1 else "s"}'
112
+ cols_with_excs_str += f' ({", ".join(status.cols_with_excs)})'
112
113
  msg = (
113
- f'Inserted {result.num_rows} row{"" if result.num_rows == 1 else "s"} '
114
- f'with {result.num_excs} error{"" if result.num_excs == 1 else "s"}{cols_with_excs_str}.'
114
+ f'Inserted {status.num_rows} row{"" if status.num_rows == 1 else "s"} '
115
+ f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}{cols_with_excs_str}.'
115
116
  )
116
117
  print(msg)
117
118
  _logger.info(f'InsertableTable {self._name}: {msg}')
118
- return result
119
+ FileCache.get().emit_eviction_warnings()
120
+ return status
119
121
 
120
122
  def _validate_input_rows(self, rows: List[Dict[str, Any]]) -> None:
121
123
  """Verify that the input rows match the table schema"""
@@ -20,6 +20,7 @@ import pixeltable.exprs as exprs
20
20
  import pixeltable.index as index
21
21
  import pixeltable.metadata.schema as schema
22
22
  import pixeltable.type_system as ts
23
+ from pixeltable.utils.filecache import FileCache
23
24
 
24
25
  from .column import Column
25
26
  from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier
@@ -33,7 +34,12 @@ if TYPE_CHECKING:
33
34
  _logger = logging.getLogger('pixeltable')
34
35
 
35
36
  class Table(SchemaObject):
36
- """Base class for table objects (base tables, views, snapshots)."""
37
+ """
38
+ Base class for table objects (base tables, views, snapshots).
39
+
40
+ Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
41
+ FileCache.emit_eviction_warnings() at the end of the operation.
42
+ """
37
43
 
38
44
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
39
45
  super().__init__(id, name, dir_id)
@@ -374,7 +380,10 @@ class Table(SchemaObject):
374
380
 
375
381
  new_col = self._create_columns({col_name: col_schema})[0]
376
382
  self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
377
- return self._tbl_version.add_column(new_col, print_stats=print_stats)
383
+ status = self._tbl_version.add_column(new_col, print_stats=print_stats)
384
+ FileCache.get().emit_eviction_warnings()
385
+ return status
386
+
378
387
 
379
388
  @classmethod
380
389
  def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
@@ -587,6 +596,7 @@ class Table(SchemaObject):
587
596
  idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
588
597
  status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
589
598
  # TODO: how to deal with exceptions here? drop the index and raise?
599
+ FileCache.get().emit_eviction_warnings()
590
600
 
591
601
  def drop_embedding_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
592
602
  """Drop an embedding index from the table.
@@ -732,7 +742,9 @@ class Table(SchemaObject):
732
742
  >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
733
743
  """
734
744
  self._check_is_dropped()
735
- return self._tbl_version.update(value_spec, where, cascade)
745
+ status = self._tbl_version.update(value_spec, where, cascade)
746
+ FileCache.get().emit_eviction_warnings()
747
+ return status
736
748
 
737
749
  def batch_update(
738
750
  self, rows: Iterable[dict[str, Any]], cascade: bool = True,
@@ -789,9 +801,11 @@ class Table(SchemaObject):
789
801
  missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
790
802
  raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
791
803
  row_updates.append(col_vals)
792
- return self._tbl_version.batch_update(
804
+ status = self._tbl_version.batch_update(
793
805
  row_updates, rowids, error_if_not_exists=if_not_exists == 'error',
794
806
  insert_if_not_exists=if_not_exists == 'insert', cascade=cascade)
807
+ FileCache.get().emit_eviction_warnings()
808
+ return status
795
809
 
796
810
  def delete(self, where: Optional['pixeltable.exprs.Expr'] = None) -> UpdateStatus:
797
811
  """Delete rows in this table.
pixeltable/env.py CHANGED
@@ -8,6 +8,7 @@ import importlib.util
8
8
  import inspect
9
9
  import logging
10
10
  import os
11
+ import shutil
11
12
  import subprocess
12
13
  import sys
13
14
  import threading
@@ -15,12 +16,12 @@ import uuid
15
16
  import warnings
16
17
  from dataclasses import dataclass
17
18
  from pathlib import Path
18
- from typing import TYPE_CHECKING, Any, Callable, Optional
19
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
19
20
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
20
21
 
21
22
  import pixeltable_pgserver
22
23
  import sqlalchemy as sql
23
- import yaml
24
+ import toml
24
25
  from tqdm import TqdmWarning
25
26
 
26
27
  import pixeltable.exceptions as excs
@@ -64,7 +65,7 @@ class Env:
64
65
  _log_to_stdout: bool
65
66
  _module_log_level: dict[str, int] # module name -> log level
66
67
  _config_file: Optional[Path]
67
- _config: Optional[dict[str, Any]]
68
+ _config: Optional[Config]
68
69
  _stdout_handler: logging.StreamHandler
69
70
  _initialized: bool
70
71
 
@@ -110,6 +111,7 @@ class Env:
110
111
  self._log_to_stdout = False
111
112
  self._module_log_level = {} # module name -> log level
112
113
 
114
+ # config
113
115
  self._config_file = None
114
116
  self._config = None
115
117
 
@@ -119,7 +121,8 @@ class Env:
119
121
  self._initialized = False
120
122
 
121
123
  @property
122
- def config(self):
124
+ def config(self) -> Config:
125
+ assert self._config is not None
123
126
  return self._config
124
127
 
125
128
  @property
@@ -227,30 +230,13 @@ class Env:
227
230
  home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
228
231
  assert self._home is None or self._home == home
229
232
  self._home = home
230
- self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.yaml')))
233
+ self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.toml')))
231
234
  self._media_dir = self._home / 'media'
232
235
  self._file_cache_dir = self._home / 'file_cache'
233
236
  self._dataset_cache_dir = self._home / 'dataset_cache'
234
237
  self._log_dir = self._home / 'logs'
235
238
  self._tmp_dir = self._home / 'tmp'
236
239
 
237
- # Read in the config
238
- if os.path.isfile(self._config_file):
239
- with open(self._config_file, 'r') as stream:
240
- try:
241
- self._config = yaml.safe_load(stream)
242
- except yaml.YAMLError as exc:
243
- self._logger.error(f'Could not read config file: {self._config_file}')
244
- self._config = {}
245
- else:
246
- self._config = {}
247
-
248
- # Disable spurious warnings
249
- warnings.simplefilter('ignore', category=TqdmWarning)
250
- if 'hide_warnings' in self._config and self._config['hide_warnings']:
251
- # Disable more warnings
252
- warnings.simplefilter('ignore', category=UserWarning)
253
-
254
240
  if self._home.exists() and not self._home.is_dir():
255
241
  raise RuntimeError(f'{self._home} is not a directory')
256
242
 
@@ -274,6 +260,22 @@ class Env:
274
260
  if not self._tmp_dir.exists():
275
261
  self._tmp_dir.mkdir()
276
262
 
263
+ # Read in the config
264
+ self._config = Config.from_file(self._config_file)
265
+ self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
266
+ if self._file_cache_size_g is None:
267
+ raise excs.Error(
268
+ 'pixeltable/file_cache_size_g is missing from configuration\n'
269
+ f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {self._config_file},\n'
270
+ 'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
271
+ )
272
+
273
+ # Disable spurious warnings
274
+ warnings.simplefilter('ignore', category=TqdmWarning)
275
+ if self._config.get_bool_value('hide_warnings'):
276
+ # Disable more warnings
277
+ warnings.simplefilter('ignore', category=UserWarning)
278
+
277
279
  # configure _logger to log to a file
278
280
  self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
279
281
  fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
@@ -313,7 +315,7 @@ class Env:
313
315
  self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
314
316
  self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
315
317
 
316
- tz_name = os.environ.get('PXT_TIME_ZONE', self._config.get('pxt_time_zone', None))
318
+ tz_name = self.config.get_string_value('time_zone')
317
319
  if tz_name is not None:
318
320
  # Validate tzname
319
321
  if not isinstance(tz_name, str):
@@ -440,21 +442,18 @@ class Env:
440
442
  if cl.client_obj is not None:
441
443
  return cl.client_obj # Already initialized
442
444
 
443
- # Construct a client. For each client parameter, first check if the parameter is in the environment;
444
- # if not, look in Pixeltable config from `config.yaml`.
445
+ # Construct a client, retrieving each parameter from config.
445
446
 
446
447
  init_kwargs: dict[str, str] = {}
447
448
  for param in cl.param_names:
448
- environ = f'{name.upper()}_{param.upper()}'
449
- if environ in os.environ:
450
- init_kwargs[param] = os.environ[environ]
451
- elif name.lower() in self._config and param in self._config[name.lower()]:
452
- init_kwargs[param] = self._config[name.lower()][param.lower()]
453
- if param not in init_kwargs or init_kwargs[param] == '':
449
+ arg = self._config.get_string_value(param, section=name)
450
+ if arg is not None and len(arg) > 0:
451
+ init_kwargs[param] = arg
452
+ else:
454
453
  raise excs.Error(
455
454
  f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
456
- f'To fix this, specify the `{environ}` environment variable, or put `{param.lower()}` in '
457
- f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.yaml.'
455
+ f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, or put `{param.lower()}` in '
456
+ f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
458
457
  )
459
458
 
460
459
  cl.client_obj = cl.init_fn(**init_kwargs)
@@ -506,7 +505,6 @@ class Env:
506
505
  self.__register_package('spacy')
507
506
  self.__register_package('tiktoken')
508
507
  self.__register_package('together')
509
- self.__register_package('toml')
510
508
  self.__register_package('torch')
511
509
  self.__register_package('torchvision')
512
510
  self.__register_package('transformers')
@@ -643,7 +641,7 @@ def register_client(name: str) -> Callable:
643
641
  Pixeltable will attempt to load the client parameters from config. For each
644
642
  config parameter:
645
643
  - If an environment variable named MY_CLIENT_API_KEY (for example) is set, use it;
646
- - Otherwise, look for 'api_key' in the 'my_client' section of config.yaml.
644
+ - Otherwise, look for 'api_key' in the 'my_client' section of config.toml.
647
645
 
648
646
  If all config parameters are found, Pixeltable calls the initialization function;
649
647
  otherwise it throws an exception.
@@ -660,6 +658,79 @@ def register_client(name: str) -> Callable:
660
658
  return decorator
661
659
 
662
660
 
661
+ class Config:
662
+ """
663
+ The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
664
+ configuration values, which can be set in the config file or as environment variables.
665
+ """
666
+ __config: dict[str, Any]
667
+
668
+ T = TypeVar('T')
669
+
670
+ @classmethod
671
+ def from_file(cls, path: Path) -> Config:
672
+ """
673
+ Loads configuration from the specified TOML file. If the file does not exist, it will be
674
+ created and populated with the default configuration.
675
+ """
676
+ if os.path.isfile(path):
677
+ with open(path, 'r') as stream:
678
+ try:
679
+ config_dict = toml.load(stream)
680
+ except Exception as exc:
681
+ raise excs.Error(f'Could not read config file: {str(path)}') from exc
682
+ else:
683
+ config_dict = cls.__create_default_config(path)
684
+ with open(path, 'w') as stream:
685
+ try:
686
+ toml.dump(config_dict, stream)
687
+ except Exception as exc:
688
+ raise excs.Error(f'Could not write config file: {str(path)}') from exc
689
+ logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
690
+ return cls(config_dict)
691
+
692
+ @classmethod
693
+ def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
694
+ free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
695
+ # Default cache size is 1/5 of free disk space
696
+ file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
697
+ return {
698
+ 'pixeltable': {
699
+ 'file_cache_size_g': round(file_cache_size_g, 1),
700
+ 'hide_warnings': False,
701
+ }
702
+ }
703
+
704
+ def __init__(self, config: dict[str, Any]) -> None:
705
+ self.__config = config
706
+
707
+ def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
708
+ env_var = f'{section.upper()}_{key.upper()}'
709
+ if env_var in os.environ:
710
+ value = os.environ[env_var]
711
+ elif section in self.__config and key in self.__config[section]:
712
+ value = self.__config[section][key]
713
+ else:
714
+ return None
715
+
716
+ try:
717
+ return expected_type(value) # type: ignore[call-arg]
718
+ except ValueError:
719
+ raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
720
+
721
+ def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
722
+ return self.get_value(key, str, section)
723
+
724
+ def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
725
+ return self.get_value(key, int, section)
726
+
727
+ def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
728
+ return self.get_value(key, float, section)
729
+
730
+ def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
731
+ return self.get_value(key, bool, section)
732
+
733
+
663
734
  _registered_clients: dict[str, ApiClient] = {}
664
735
 
665
736
 
pixeltable/exceptions.py CHANGED
@@ -1,6 +1,9 @@
1
- from typing import List, Any
2
- from types import TracebackType
3
1
  from dataclasses import dataclass
2
+ from types import TracebackType
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ if TYPE_CHECKING:
6
+ from pixeltable import exprs
4
7
 
5
8
 
6
9
  class Error(Exception):
@@ -9,11 +12,11 @@ class Error(Exception):
9
12
 
10
13
  @dataclass
11
14
  class ExprEvalError(Exception):
12
- expr: Any # exprs.Expr, but we're not importing pixeltable.exprs to avoid circular imports
15
+ expr: 'exprs.Expr'
13
16
  expr_msg: str
14
17
  exc: Exception
15
18
  exc_tb: TracebackType
16
- input_vals: List[Any]
19
+ input_vals: list[Any]
17
20
  row_num: int
18
21
 
19
22
 
@@ -1,10 +1,12 @@
1
- from typing import Generator, Optional
1
+ import inspect
2
+ from typing import Iterator, Optional
2
3
 
3
- from .data_row_batch import DataRowBatch
4
- from .exec_node import ExecNode
5
4
  import pixeltable.catalog as catalog
6
- import pixeltable.exprs as exprs
7
5
  import pixeltable.exceptions as excs
6
+ import pixeltable.exprs as exprs
7
+
8
+ from .data_row_batch import DataRowBatch
9
+ from .exec_node import ExecNode
8
10
 
9
11
 
10
12
  class ComponentIterationNode(ExecNode):
@@ -12,7 +14,7 @@ class ComponentIterationNode(ExecNode):
12
14
 
13
15
  Returns row batches of OUTPUT_BATCH_SIZE size.
14
16
  """
15
- OUTPUT_BATCH_SIZE = 1024
17
+ __OUTPUT_BATCH_SIZE = 1024
16
18
 
17
19
  def __init__(self, view: catalog.TableVersion, input: ExecNode):
18
20
  assert view.is_component_view()
@@ -23,57 +25,76 @@ class ComponentIterationNode(ExecNode):
23
25
  self.iterator_args = iterator_args[0]
24
26
  assert isinstance(self.iterator_args, exprs.InlineDict)
25
27
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
26
- self.iterator_output_schema, self.unstored_column_names = \
28
+ self.iterator_output_schema, self.unstored_column_names = (
27
29
  self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
30
+ )
28
31
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
29
- self.iterator_output_cols = \
30
- {field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields}
32
+ self.iterator_output_cols = {
33
+ field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
34
+ }
31
35
  # referenced iterator output fields
32
36
  self.refd_output_slot_idxs = {
33
37
  e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
34
38
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
35
39
  }
36
- self._output: Optional[Generator[DataRowBatch, None, None]] = None
40
+ self.__output: Optional[Iterator[DataRowBatch]] = None
37
41
 
38
- def _output_batches(self) -> Generator[DataRowBatch, None, None]:
42
+ def __output_batches(self) -> Iterator[DataRowBatch]:
39
43
  output_batch = DataRowBatch(self.view, self.row_builder)
40
44
  for input_batch in self.input:
41
45
  for input_row in input_batch:
42
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
43
47
  iterator_args = input_row[self.iterator_args.slot_idx]
44
- iterator = self.view.iterator_cls(**iterator_args)
45
- for pos, component_dict in enumerate(iterator):
46
- output_row = output_batch.add_row()
47
- input_row.copy(output_row)
48
- # we're expanding the input and need to add the iterator position to the pk
49
- pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
50
- output_row.set_pk(pk)
51
-
52
- # verify and copy component_dict fields to their respective slots in output_row
53
- for field_name, field_val in component_dict.items():
54
- if field_name not in self.iterator_output_fields:
55
- raise excs.Error(
56
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
57
- if field_name not in self.refd_output_slot_idxs:
58
- # we can ignore this
59
- continue
60
- output_col = self.iterator_output_cols[field_name]
61
- output_col.col_type.validate_literal(field_val)
62
- output_row[self.refd_output_slot_idxs[field_name]] = field_val
63
- if len(component_dict) != len(self.iterator_output_fields):
64
- missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
65
- raise excs.Error(
66
- f'Invalid output of {self.view.iterator_cls.__name__}: '
67
- f'missing fields {", ".join(missing_fields)}')
68
-
69
- if len(output_batch) == self.OUTPUT_BATCH_SIZE:
70
- yield output_batch
71
- output_batch = DataRowBatch(self.view, self.row_builder)
48
+ assert isinstance(iterator_args, dict)
49
+ # We need to ensure that all of the required (non-nullable) parameters of the iterator are
50
+ # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
51
+ # output rows for this input row).
52
+ if self.__non_nullable_args_specified(iterator_args):
53
+ iterator = self.view.iterator_cls(**iterator_args)
54
+ for pos, component_dict in enumerate(iterator):
55
+ output_row = output_batch.add_row()
56
+ input_row.copy(output_row)
57
+ # we're expanding the input and need to add the iterator position to the pk
58
+ self.__populate_output_row(output_row, pos, component_dict)
59
+ if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
+ yield output_batch
61
+ output_batch = DataRowBatch(self.view, self.row_builder)
72
62
 
73
63
  if len(output_batch) > 0:
74
64
  yield output_batch
75
65
 
66
+ def __non_nullable_args_specified(self, iterator_args: dict) -> bool:
67
+ """
68
+ Returns true if all non-nullable iterator arguments are not `None`.
69
+ """
70
+ input_schema = self.view.iterator_cls.input_schema()
71
+ for arg_name, arg_value in iterator_args.items():
72
+ col_type = input_schema[arg_name]
73
+ if arg_value is None and not col_type.nullable:
74
+ return False
75
+ return True
76
+
77
+ def __populate_output_row(self, output_row: exprs.DataRow, pos: int, component_dict: dict) -> None:
78
+ pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
79
+ output_row.set_pk(pk)
80
+ # verify and copy component_dict fields to their respective slots in output_row
81
+ for field_name, field_val in component_dict.items():
82
+ if field_name not in self.iterator_output_fields:
83
+ raise excs.Error(
84
+ f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
85
+ if field_name not in self.refd_output_slot_idxs:
86
+ # we can ignore this
87
+ continue
88
+ output_col = self.iterator_output_cols[field_name]
89
+ output_col.col_type.validate_literal(field_val)
90
+ output_row[self.refd_output_slot_idxs[field_name]] = field_val
91
+ if len(component_dict) != len(self.iterator_output_fields):
92
+ missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
93
+ raise excs.Error(
94
+ f'Invalid output of {self.view.iterator_cls.__name__}: '
95
+ f'missing fields {", ".join(missing_fields)}')
96
+
76
97
  def __next__(self) -> DataRowBatch:
77
- if self._output is None:
78
- self._output = self._output_batches()
79
- return next(self._output)
98
+ if self.__output is None:
99
+ self.__output = self.__output_batches()
100
+ return next(self.__output)
@@ -105,12 +105,9 @@ class JsonPath(Expr):
105
105
  return JsonPath(self._anchor, self.path_elements + [name])
106
106
 
107
107
  def __getitem__(self, index: object) -> 'JsonPath':
108
- if isinstance(index, str):
109
- if index != '*':
110
- raise excs.Error(f'Invalid json list index: {index}')
111
- elif not isinstance(index, (int, slice)):
112
- raise excs.Error(f'Invalid json list index: {index}')
113
- return JsonPath(self._anchor, self.path_elements + [index])
108
+ if isinstance(index, (int, slice, str)):
109
+ return JsonPath(self._anchor, self.path_elements + [index])
110
+ raise excs.Error(f'Invalid json list index: {index}')
114
111
 
115
112
  def __rshift__(self, other: object) -> 'JsonMapper':
116
113
  rhs_expr = Expr.from_object(other)
@@ -7,13 +7,15 @@ the [Working with Together AI](https://pixeltable.readme.io/docs/together-ai) tu
7
7
 
8
8
  import base64
9
9
  import io
10
- from typing import TYPE_CHECKING, Callable, Optional
10
+ from typing import TYPE_CHECKING, Callable, Optional, TypeVar
11
11
 
12
12
  import numpy as np
13
13
  import PIL.Image
14
+ import requests
14
15
  import tenacity
15
16
 
16
17
  import pixeltable as pxt
18
+ import pixeltable.exceptions as excs
17
19
  from pixeltable import env
18
20
  from pixeltable.func import Batch
19
21
  from pixeltable.utils.code import local_public_names
@@ -32,7 +34,10 @@ def _together_client() -> 'together.Together':
32
34
  return env.Env.get().get_client('together')
33
35
 
34
36
 
35
- def _retry(fn: Callable) -> Callable:
37
+ T = TypeVar('T')
38
+
39
+
40
+ def _retry(fn: Callable[..., T]) -> Callable[..., T]:
36
41
  import together
37
42
  return tenacity.retry(
38
43
  retry=tenacity.retry_if_exception_type(together.error.RateLimitError),
@@ -249,20 +254,29 @@ def image_generations(
249
254
  The generated image.
250
255
 
251
256
  Examples:
252
- Add a computed column that applies the model `runwayml/stable-diffusion-v1-5`
257
+ Add a computed column that applies the model `stabilityai/stable-diffusion-xl-base-1.0`
253
258
  to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
254
259
 
255
- >>> tbl['response'] = image_generations(tbl.prompt, model='runwayml/stable-diffusion-v1-5')
260
+ >>> tbl['response'] = image_generations(tbl.prompt, model='stabilityai/stable-diffusion-xl-base-1.0')
256
261
  """
257
- # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
258
262
  result = _retry(_together_client().images.generate)(
259
263
  prompt=prompt, model=model, steps=steps, seed=seed, height=height, width=width, negative_prompt=negative_prompt
260
264
  )
261
- b64_str = result.data[0].b64_json
262
- b64_bytes = base64.b64decode(b64_str)
263
- img = PIL.Image.open(io.BytesIO(b64_bytes))
264
- img.load()
265
- return img
265
+ if result.data[0].b64_json is not None:
266
+ b64_bytes = base64.b64decode(result.data[0].b64_json)
267
+ img = PIL.Image.open(io.BytesIO(b64_bytes))
268
+ img.load()
269
+ return img
270
+ if result.data[0].url is not None:
271
+ try:
272
+ resp = requests.get(result.data[0].url)
273
+ with io.BytesIO(resp.content) as fp:
274
+ image = PIL.Image.open(fp)
275
+ image.load()
276
+ return image
277
+ except Exception as exc:
278
+ raise excs.Error('Failed to download generated image from together.ai.') from exc
279
+ raise excs.Error('Response does not contain a generated image.')
266
280
 
267
281
 
268
282
  __all__ = local_public_names(__name__)
pixeltable/globals.py CHANGED
@@ -16,6 +16,7 @@ from pixeltable.dataframe import DataFrameResultSet
16
16
  from pixeltable.env import Env
17
17
  from pixeltable.iterators import ComponentIterator
18
18
  from pixeltable.metadata import schema
19
+ from pixeltable.utils.filecache import FileCache
19
20
 
20
21
  _logger = logging.getLogger('pixeltable')
21
22
 
@@ -193,6 +194,7 @@ def create_view(
193
194
  )
194
195
  Catalog.get().paths[path] = view
195
196
  _logger.info(f'Created view `{path_str}`.')
197
+ FileCache.get().emit_eviction_warnings()
196
198
  return view
197
199
 
198
200
 
pixeltable/io/globals.py CHANGED
@@ -43,7 +43,7 @@ def create_label_studio_project(
43
43
  The API key and URL for a valid Label Studio server must be specified in Pixeltable config. Either:
44
44
 
45
45
  * Set the `LABEL_STUDIO_API_KEY` and `LABEL_STUDIO_URL` environment variables; or
46
- * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.yaml`.
46
+ * Specify `api_key` and `url` fields in the `label-studio` section of `$PIXELTABLE_HOME/config.toml`.
47
47
 
48
48
  __Requirements:__
49
49
 
@@ -34,9 +34,7 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
34
34
  }
35
35
 
36
36
 
37
- def _to_pixeltable_type(
38
- feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
39
- ) -> Optional[ts.ColumnType]:
37
+ def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
40
38
  """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
41
39
  import datasets
42
40
 
@@ -51,6 +49,8 @@ def _to_pixeltable_type(
51
49
  dtype = _to_pixeltable_type(feature_type.feature)
52
50
  length = feature_type.length if feature_type.length != -1 else None
53
51
  return ts.ArrayType(shape=(length,), dtype=dtype)
52
+ elif isinstance(feature_type, datasets.Image):
53
+ return ts.ImageType(nullable=True)
54
54
  else:
55
55
  return None
56
56
 
@@ -166,7 +166,7 @@ class DocumentSplitter(ComponentIterator):
166
166
  return {
167
167
  'document': DocumentType(nullable=False),
168
168
  'separators': StringType(nullable=False),
169
- 'metadata': StringType(nullable=True),
169
+ 'metadata': StringType(nullable=False),
170
170
  'limit': IntType(nullable=True),
171
171
  'overlap': IntType(nullable=True),
172
172
  'skip_tags': StringType(nullable=True),
@@ -36,7 +36,7 @@ class Dumper:
36
36
  mock_home_dir = self.output_dir / '.pixeltable'
37
37
  mock_home_dir.mkdir(parents=True, exist_ok=True)
38
38
  os.environ['PIXELTABLE_HOME'] = str(mock_home_dir)
39
- os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.yaml')
39
+ os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.toml')
40
40
  os.environ['PIXELTABLE_DB'] = db_name
41
41
  os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')
42
42
 
@@ -1,28 +1,33 @@
1
1
  from __future__ import annotations
2
- from typing import Optional, List, Tuple, Dict
3
- from collections import OrderedDict, defaultdict, namedtuple
4
- import os
2
+
5
3
  import glob
6
- from pathlib import Path
7
- from time import time
4
+ import hashlib
8
5
  import logging
6
+ import os
7
+ import warnings
8
+ from collections import OrderedDict, defaultdict, namedtuple
9
+ from dataclasses import dataclass
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Optional
9
13
  from uuid import UUID
10
- import hashlib
11
14
 
15
+ import pixeltable.exceptions as excs
12
16
  from pixeltable.env import Env
13
17
 
14
-
15
18
  _logger = logging.getLogger('pixeltable')
16
19
 
20
+ @dataclass
17
21
  class CacheEntry:
18
- def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: int, ext: str):
19
- self.key = key
20
- self.tbl_id = tbl_id
21
- self.col_id = col_id
22
- self.size = size
23
- self.last_accessed_ts = last_accessed_ts
24
- self.ext = ext
25
22
 
23
+ key: str
24
+ tbl_id: UUID
25
+ col_id: int
26
+ size: int
27
+ last_used: datetime
28
+ ext: str
29
+
30
+ @property
26
31
  def path(self) -> Path:
27
32
  return Env.get().file_cache_dir / f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'
28
33
 
@@ -34,7 +39,11 @@ class CacheEntry:
34
39
  col_id = int(components[1])
35
40
  key = components[2]
36
41
  file_info = os.stat(str(path))
37
- return cls(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
42
+ # We use the last modified time (file_info.st_mtime) as the timestamp; `FileCache` will touch the file
43
+ # each time it is retrieved, so that the mtime of the file will always represent the last used time of
44
+ # the cache entry.
45
+ last_used = datetime.fromtimestamp(file_info.st_mtime, tz=timezone.utc)
46
+ return cls(key, tbl_id, col_id, file_info.st_size, last_used, path.suffix)
38
47
 
39
48
 
40
49
  class FileCache:
@@ -45,31 +54,60 @@ class FileCache:
45
54
  access of a cache entry is its file's mtime.
46
55
 
47
56
  TODO:
48
- - enforce a maximum capacity with LRU eviction
49
57
  - implement MRU eviction for queries that exceed the capacity
50
58
  """
51
- _instance: Optional[FileCache] = None
52
- ColumnStats = namedtuple('FileCacheColumnStats', ['tbl_id', 'col_id', 'num_files', 'total_size'])
59
+ __instance: Optional[FileCache] = None
60
+
61
+ cache: OrderedDict[str, CacheEntry]
62
+ total_size: int
63
+ capacity_bytes: int
64
+ num_requests: int
65
+ num_hits: int
66
+ num_evictions: int
67
+ keys_retrieved: set[str] # keys retrieved (downloaded or accessed) this session
68
+ keys_evicted_after_retrieval: set[str] # keys that were evicted after having been retrieved this session
69
+
70
+ # A key is added to this set when it is already present in `keys_evicted_after_retrieval` and is downloaded again.
71
+ # In other words, for a key to be added to this set, the following sequence of events must occur in this order:
72
+ # - It is retrieved during this session (either because it was newly downloaded, or because it was in the cache
73
+ # at the start of the session and was accessed at some point during the session)
74
+ # - It is subsequently evicted
75
+ # - It is subsequently retrieved a second time ("download after a previous retrieval")
76
+ # The contents of this set will be used to generate a more informative warning.
77
+ evicted_working_set_keys: set[str]
78
+ new_redownload_witnessed: bool # whether a new re-download has occurred since the last time a warning was issued
79
+
80
+ ColumnStats = namedtuple('FileCacheColumnStats', ('tbl_id', 'col_id', 'num_files', 'total_size'))
53
81
  CacheStats = namedtuple(
54
- 'FileCacheStats', ['total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats'])
82
+ 'FileCacheStats',
83
+ ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
84
+ )
55
85
 
56
86
  @classmethod
57
87
  def get(cls) -> FileCache:
58
- if cls._instance is None:
59
- cls._instance = cls()
60
- return cls._instance
88
+ if cls.__instance is None:
89
+ cls.init()
90
+ return cls.__instance
91
+
92
+ @classmethod
93
+ def init(cls) -> None:
94
+ cls.__instance = cls()
61
95
 
62
96
  def __init__(self):
63
- self.cache: OrderedDict[str, CacheEntry] = OrderedDict() # ordered by entry.last_accessed_ts
97
+ self.cache = OrderedDict()
64
98
  self.total_size = 0
65
- #self.capacity = Env.get().max_filecache_size
99
+ self.capacity_bytes = Env.get()._file_cache_size_g * (1 << 30)
66
100
  self.num_requests = 0
67
101
  self.num_hits = 0
68
102
  self.num_evictions = 0
103
+ self.keys_retrieved = set()
104
+ self.keys_evicted_after_retrieval = set()
105
+ self.evicted_working_set_keys = set()
106
+ self.new_redownload_witnessed = False
69
107
  paths = glob.glob(str(Env.get().file_cache_dir / '*'))
70
108
  entries = [CacheEntry.from_file(Path(path_str)) for path_str in paths]
71
- # we need to insert entries in order of last_accessed_ts
72
- entries.sort(key=lambda e: e.last_accessed_ts)
109
+ # we need to insert entries in access order
110
+ entries.sort(key=lambda e: e.last_used)
73
111
  for entry in entries:
74
112
  self.cache[entry.key] = entry
75
113
  self.total_size += entry.size
@@ -82,30 +120,43 @@ class FileCache:
82
120
  def num_files(self, tbl_id: Optional[UUID] = None) -> int:
83
121
  if tbl_id is None:
84
122
  return len(self.cache)
85
- entries = [e for e in self.cache.values() if e.tbl_id == tbl_id]
86
- return len(entries)
123
+ return sum(e.tbl_id == tbl_id for e in self.cache.values())
87
124
 
88
- def clear(self, tbl_id: Optional[UUID] = None, capacity: Optional[int] = None) -> None:
125
+ def clear(self, tbl_id: Optional[UUID] = None) -> None:
89
126
  """
90
127
  For testing purposes: remove cached files (all of them, or only those of table `tbl_id`); stats are reset only when the whole cache is cleared.
91
128
  """
92
- self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
93
- entries = list(self.cache.values()) # list(): avoid dealing with values() return type
94
- if tbl_id is not None:
95
- entries = [e for e in entries if e.tbl_id == tbl_id]
96
- _logger.debug(f'clearing {len(entries)} entries from file cache for table {tbl_id}')
129
+ if tbl_id is None:
130
+ # We need to store the entries to remove in a list, because we can't remove items from a dict while iterating
131
+ entries_to_remove = list(self.cache.values())
132
+ _logger.debug(f'clearing {self.num_files()} entries from file cache')
133
+ self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
134
+ self.keys_retrieved.clear()
135
+ self.keys_evicted_after_retrieval.clear()
136
+ self.new_redownload_witnessed = False
97
137
  else:
98
- _logger.debug(f'clearing {len(entries)} entries from file cache')
99
- for entry in entries:
138
+ entries_to_remove = [e for e in self.cache.values() if e.tbl_id == tbl_id]
139
+ _logger.debug(f'clearing {self.num_files(tbl_id)} entries from file cache for table {tbl_id}')
140
+ for entry in entries_to_remove:
141
+ os.remove(entry.path)
100
142
  del self.cache[entry.key]
101
143
  self.total_size -= entry.size
102
- os.remove(entry.path())
103
- # if capacity is not None:
104
- # self.capacity = capacity
105
- # else:
106
- # # need to reset to default
107
- # self.capacity = Env.get().max_filecache_size
108
- # _logger.debug(f'setting file cache capacity to {self.capacity}')
144
+
145
+ def emit_eviction_warnings(self) -> None:
146
+ if self.new_redownload_witnessed:
147
+ # Compute the additional capacity that would be needed in order to retain all the re-downloaded files
148
+ extra_capacity_needed = sum(self.cache[key].size for key in self.evicted_working_set_keys)
149
+ suggested_cache_size = self.capacity_bytes + extra_capacity_needed + (1 << 30)
150
+ warnings.warn(
151
+ f'{len(self.evicted_working_set_keys)} media file(s) had to be downloaded multiple times this session, '
152
+ 'because they were evicted\nfrom the file cache after their first access. The total size '
153
+ f'of the evicted file(s) is {round(extra_capacity_needed / (1 << 30), 1)} GiB.\n'
154
+ f'Consider increasing the cache size to at least {round(suggested_cache_size / (1 << 30), 1)} GiB '
155
+ f'(it is currently {round(self.capacity_bytes / (1 << 30), 1)} GiB).\n'
156
+ f'You can do this by setting the value of `file_cache_size_g` in: {str(Env.get()._config_file)}',
157
+ excs.PixeltableWarning
158
+ )
159
+ self.new_redownload_witnessed = False
109
160
 
110
161
  def _url_hash(self, url: str) -> str:
111
162
  h = hashlib.sha256()
@@ -120,66 +171,62 @@ class FileCache:
120
171
  _logger.debug(f'file cache miss for {url}')
121
172
  return None
122
173
  # update mtime and cache
123
- path = entry.path()
174
+ path = entry.path
124
175
  path.touch(exist_ok=True)
125
176
  file_info = os.stat(str(path))
126
- entry.last_accessed_ts = file_info.st_mtime
177
+ entry.last_used = file_info.st_mtime
127
178
  self.cache.move_to_end(key, last=True)
128
179
  self.num_hits += 1
180
+ self.keys_retrieved.add(key)
129
181
  _logger.debug(f'file cache hit for {url}')
130
182
  return path
131
183
 
132
- # def can_admit(self, query_ts: int) -> bool:
133
- # if self.total_size + self.avg_file_size <= self.capacity:
134
- # return True
135
- # assert len(self.cache) > 0
136
- # # check whether we can evict the current lru entry
137
- # lru_entry = next(iter(self.cache.values()))
138
- # if lru_entry.last_accessed_ts >= query_ts:
139
- # # the current query brought this entry in: we're not going to evict it
140
- # return False
141
- # return True
142
-
143
184
  def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
144
185
  """Adds url at 'path' to cache and returns its new path.
145
186
  'path' will not be accessible after this call. Retains the extension of 'path'.
146
187
  """
147
188
  file_info = os.stat(str(path))
148
- _ = time()
149
- #if self.total_size + file_info.st_size > self.capacity:
150
- if False:
151
- if len(self.cache) == 0:
152
- # nothing to evict
153
- return
154
- # evict entries until we're below the limit or until we run into entries the current query brought in
155
- while True:
156
- lru_entry = next(iter(self.cache.values()))
157
- if lru_entry.last_accessed_ts >= query_ts:
158
- # the current query brought this entry in: switch to MRU and ignore this put()
159
- _logger.debug('file cache switched to MRU')
160
- return
161
- self.cache.popitem(last=False)
162
- self.total_size -= lru_entry.size
163
- self.num_evictions += 1
164
- os.remove(str(lru_entry.path()))
165
- _logger.debug(f'evicted entry for cell {lru_entry.cell_id} from file cache')
166
- if self.total_size + file_info.st_size <= self.capacity:
167
- break
168
-
189
+ self.ensure_capacity(file_info.st_size)
169
190
  key = self._url_hash(url)
170
191
  assert key not in self.cache
192
+ if key in self.keys_evicted_after_retrieval:
193
+ # This key was evicted after being retrieved earlier this session, and is now being retrieved again.
194
+ # Add it to `evicted_working_set_keys` so that we may generate a warning later.
195
+ self.evicted_working_set_keys.add(key)
196
+ self.new_redownload_witnessed = True
197
+ self.keys_retrieved.add(key)
171
198
  entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
172
199
  self.cache[key] = entry
173
200
  self.total_size += entry.size
174
- new_path = entry.path()
201
+ new_path = entry.path
175
202
  os.rename(str(path), str(new_path))
203
+ new_path.touch(exist_ok=True)
176
204
  _logger.debug(f'added entry for cell {url} to file cache')
177
205
  return new_path
178
206
 
207
+ def ensure_capacity(self, size: int) -> None:
208
+ """
209
+ Evict entries from the cache until there are at least 'size' bytes of free space (or the cache is empty).
210
+ """
211
+ while len(self.cache) > 0 and self.total_size + size > self.capacity_bytes:
212
+ _, lru_entry = self.cache.popitem(last=False)
213
+ self.total_size -= lru_entry.size
214
+ self.num_evictions += 1
215
+ if lru_entry.key in self.keys_retrieved:
216
+ # This key was retrieved at some point earlier this session and is now being evicted.
217
+ # Make a record of the eviction, so that we can generate a warning later if the key is retrieved again.
218
+ self.keys_evicted_after_retrieval.add(lru_entry.key)
219
+ os.remove(str(lru_entry.path))
220
+ _logger.debug(f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)')
221
+
222
+ def set_capacity(self, capacity_bytes: int) -> None:
223
+ self.capacity_bytes = capacity_bytes
224
+ self.ensure_capacity(0) # evict entries if necessary
225
+
179
226
  def stats(self) -> CacheStats:
180
227
  # collect column stats
181
228
  # (tbl_id, col_id) -> (num_files, total_size)
182
- d: Dict[Tuple[int, int], List[int]] = defaultdict(lambda: [0, 0])
229
+ d: dict[tuple[int, int], list[int]] = defaultdict(lambda: [0, 0])
183
230
  for entry in self.cache.values():
184
231
  t = d[(entry.tbl_id, entry.col_id)]
185
232
  t[0] += 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pixeltable
3
- Version: 0.2.19
3
+ Version: 0.2.20
4
4
  Summary: Pixeltable: The Multimodal AI Data Plane
5
5
  Author: Pixeltable, Inc.
6
6
  Author-email: contact@pixeltable.com
@@ -31,6 +31,7 @@ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
31
31
  Requires-Dist: requests (>=2.31.0,<3.0.0)
32
32
  Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0)
33
33
  Requires-Dist: tenacity (>=8.2,<9.0)
34
+ Requires-Dist: toml (>=0.10)
34
35
  Requires-Dist: tqdm (>=4.64)
35
36
  Description-Content-Type: text/markdown
36
37
 
@@ -46,10 +47,17 @@ Description-Content-Type: text/markdown
46
47
  [![tests status](https://github.com/pixeltable/pixeltable/actions/workflows/nightly.yml/badge.svg)](https://github.com/pixeltable/pixeltable/actions/workflows/nightly.yml)
47
48
  [![PyPI Package](https://img.shields.io/pypi/v/pixeltable?color=darkorange)](https://pypi.org/project/pixeltable/)
48
49
 
49
- [Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://pixeltable.readme.io/recipes) | [Examples](https://github.com/pixeltable/pixeltable/tree/release/docs/release/tutorials)
50
+ [Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#-code-samples) | [Computer Vision](https://docs.pixeltable.com/docs/object-detection-in-videos) | [LLM](https://docs.pixeltable.com/docs/document-indexing-and-rag)
50
51
  </div>
51
52
 
52
- Pixeltable is a Python library providing a declarative interface for multimodal data (text, images, audio, video). It features built-in versioning, lineage tracking, and incremental updates, enabling users to store, transform, index, and iterate on data for their ML workflows. Data transformations, model inference, and custom logic are embedded as computed columns.
53
+ Pixeltable is a Python library providing a declarative interface for multimodal data (text, images, audio, video). It features built-in versioning, lineage tracking, and incremental updates, enabling users to **store**, **transform**, **index**, and **iterate** on data for their ML workflows.
54
+
55
+ Data transformations, model inference, and custom logic are embedded as **computed columns**.
56
+ - **Load/Query all data types**: Interact with [video data](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#import-media-data-into-pixeltable-videos-images-audio) at the [frame level](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#text-and-image-similarity-search-on-video-frames-with-embedding-indexes) and documents at the [chunk level](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#automate-data-operations-with-views-eg-split-documents-into-chunks)
57
+ - **Incremental updates for data transformation**: Maintain an [embedding index](https://docs.pixeltable.com/docs/embedding-vector-indexes) colocated with your data
58
+ - **Lazy evaluation and cache management**: Eliminates the need for [manual frame extraction](https://docs.pixeltable.com/docs/object-detection-in-videos)
59
+ - **Integrates with any Python library**: Use [built-in and custom functions (UDFs)](https://docs.pixeltable.com/docs/user-defined-functions-udfs) without complex pipelines
60
+ - **Data-format agnostic and extensible**: Access tables as Parquet files, [PyTorch datasets](https://pixeltable.github.io/pixeltable/api/data-frame/#pixeltable.DataFrame.to_pytorch_dataset), or [COCO annotations](https://pixeltable.github.io/pixeltable/api/table/#pixeltable.Table.to_coco_dataset)
53
61
 
54
62
  ## 💾 Installation
55
63
 
@@ -1,26 +1,26 @@
1
1
  pixeltable/__init__.py,sha256=t1uRHKta7mPH9_KgkUpOWBu6AewA7DRdSGGyrm0OcSQ,1279
2
- pixeltable/__version__.py,sha256=_o-1q05Ttbs4cHNYgDuwFGf4i38fZkI3gdZZEbf8-co,114
2
+ pixeltable/__version__.py,sha256=RrlUQ8lgLgO05DWPZnMK9eYJ0O2bfE8N7RBzpECP5f8,114
3
3
  pixeltable/catalog/__init__.py,sha256=E41bxaPeQIcgRYzTWc2vkDOboQhRymrJf4IcHQO7o_8,453
4
4
  pixeltable/catalog/catalog.py,sha256=tyDyI5wQw7vV6_FChrp9qgGCRClcjiSdW3eygYT0p9s,7849
5
5
  pixeltable/catalog/column.py,sha256=Be3WmOadMROS2s4IgtG_Ohjkr07eU9GJItl6WhNishQ,9683
6
6
  pixeltable/catalog/dir.py,sha256=fG_BQM-fLuABpTstMVH-9dvZPx7kqi3sgTQgKveVXJI,922
7
7
  pixeltable/catalog/globals.py,sha256=XeOeDqq1nDEcpqkY7PYBosoL6tXVAfkJSLJN9aQ_9Fg,1850
8
- pixeltable/catalog/insertable_table.py,sha256=XpA87IoYtEe1xVbbahYRowhGq3fHHc4M9GpIViggNCU,6724
8
+ pixeltable/catalog/insertable_table.py,sha256=DFL93x8ihYEnK_yCR8EdYHDQqAomJdU11ygTF0jEFWY,6822
9
9
  pixeltable/catalog/named_function.py,sha256=W8vikP_3jMJ9pQQsksO2EfQAlaVxuQHBlo65M4924dc,1150
10
10
  pixeltable/catalog/path.py,sha256=QgccEi_QOfaKt8YsR2zLtd_z7z7QQkU_1kprJFi2SPQ,1677
11
11
  pixeltable/catalog/path_dict.py,sha256=4b9_Ax7Q8tkmoCYPaKNedpQkU17pE0oGDd2XB53eNZA,5979
12
12
  pixeltable/catalog/schema_object.py,sha256=qhpeeUPOYT5doDbsyUNBcPm5QzAQPCAsikqh1PQ6d1k,2226
13
- pixeltable/catalog/table.py,sha256=eVGCXZxtEaG8Y4IEe-8IgPtIFAaUsycJqMdVE_WpKPA,40811
13
+ pixeltable/catalog/table.py,sha256=NQMZwG6wPu8DzJmZLXTkDm_Dth0AmNXhcixNqiXlPuc,41307
14
14
  pixeltable/catalog/table_version.py,sha256=4_djeYLGu9ljRSXe_f14c3HvXL0o0P2-sOZ-1bBQzYw,56991
15
15
  pixeltable/catalog/table_version_path.py,sha256=Ee6nPh5Jgbp91qFSKkCwdzIpQ3gJqv3SG06bFFLhbBE,6139
16
16
  pixeltable/catalog/view.py,sha256=RfQRldjPUZ7W8jMMdXJFSjbjCUe-3ynxDFvg4W27qXc,10642
17
17
  pixeltable/dataframe.py,sha256=kAPv9YjOEx0xZViFG3fi6eXsX6zUhm3F2x5U7qDOrJU,34378
18
- pixeltable/env.py,sha256=fAxc0vXRN_OkooqepIbHQJPyoY2aQwWmfxYmFCq-8pc,27374
19
- pixeltable/exceptions.py,sha256=pgMRe11dqUdSWaTssib4bXj9R_HNqwhdqKSnMsRkSvE,422
18
+ pixeltable/env.py,sha256=XHxv2P5Aj1dvUxwlvfoxahVemhJFyapmW7p3pf8Vq7g,30133
19
+ pixeltable/exceptions.py,sha256=NuFY2WtkQpLfLHT_J70kOw9Tr0kEDkkgo-u7As4Gaq4,410
20
20
  pixeltable/exec/__init__.py,sha256=VRENEONsAv3PPoBV0r7h-7nAB7SWM4Uglmu1FVQE5uQ,507
21
21
  pixeltable/exec/aggregation_node.py,sha256=-DunTLlVh3OflpwTIjkwKGczotl4i3oUqrvfyvRjv6Q,3452
22
22
  pixeltable/exec/cache_prefetch_node.py,sha256=d5pEuR6AtJQkEVy9X3XeYFI_q0szMtoNAH96vYdtBE0,5241
23
- pixeltable/exec/component_iteration_node.py,sha256=d5XHahyq-cosYW6BaIeFLuU8zNRmDL8eGtiN_eYY8ZM,4068
23
+ pixeltable/exec/component_iteration_node.py,sha256=ABuXGbDRQWLGuaBnfK7bvOxCrz81vMMiAvXHHI8SX4c,4930
24
24
  pixeltable/exec/data_row_batch.py,sha256=1IDYHBkSQ60dwOnAGnS-Wpp3AsnbMqKcY40zUT7ku-Q,3392
25
25
  pixeltable/exec/exec_context.py,sha256=0rg5V8HzSy-BvqmSbGr-U4aJ4eOZg2JN0x6zjYQGtBc,1090
26
26
  pixeltable/exec/exec_node.py,sha256=ixkv3p_EfF53UDWgwLjQGKR1LNIQxzgDXsTzzJj6ea4,2211
@@ -45,7 +45,7 @@ pixeltable/exprs/in_predicate.py,sha256=vJwT07SlDXBYMbqpf-dgV2gr6je5DehrpkPBapnZ
45
45
  pixeltable/exprs/inline_expr.py,sha256=FIQsgwfz-9qmghnaTSTL3522Mhr9GQUKM_SDxzA4P5w,7055
46
46
  pixeltable/exprs/is_null.py,sha256=qkzxr0NPuID77gs-J_tXj0MYuoCPBEd3Iq6MUWJ_dSc,1101
47
47
  pixeltable/exprs/json_mapper.py,sha256=grr-9xVOU_TUL1wtON7wNqZ10-p3mGp66cTCofQKkqc,4590
48
- pixeltable/exprs/json_path.py,sha256=cdAA73af_ZK2hZAzROA9fpP8dTBrHYzcTi4APgtJNdk,7051
48
+ pixeltable/exprs/json_path.py,sha256=xlwUeYL8D--dPTMhzoyCtkQVeik0sfwI7k_XlNs0eS4,6912
49
49
  pixeltable/exprs/literal.py,sha256=ofhMe2kiT4tWNuzf2zKOiGY5pml10dRqbV0e9HGVcbs,3780
50
50
  pixeltable/exprs/method_ref.py,sha256=6TQnl5JhsUqKNPFUbu2tzu5svF_BZf5rfm2cZo740Ts,2600
51
51
  pixeltable/exprs/object_ref.py,sha256=UDLfpFXrOTrYZOVWH6G5dx4Ax_BxFTpLOaIab3MuyyI,1282
@@ -81,26 +81,26 @@ pixeltable/functions/mistralai.py,sha256=U7f6g4EyHMsik8HMIdJIKn6xFSCdQH6950AAOYL
81
81
  pixeltable/functions/openai.py,sha256=yr2hgUa0ZtUJOezSC9aVqp-BoxADf-gmYoK8FE2jbVU,15930
82
82
  pixeltable/functions/string.py,sha256=RCGj5bXx7MWgcdcOuy1IMTn3vBvGzjgxudyUrDqWdAg,20153
83
83
  pixeltable/functions/timestamp.py,sha256=lyWPv2sCpejD2t9DB62nxJEm0kWLNsAW8yMiT5iEsOo,9121
84
- pixeltable/functions/together.py,sha256=h2FqZomgLhTkpwJ-t1mFgbBrFPbNzKE4CLlqTEc-qwY,8947
84
+ pixeltable/functions/together.py,sha256=pmd_Xo9XaJ9M8-Zx1bDb4pnomHGZ5swBENHYx-uhmPs,9480
85
85
  pixeltable/functions/util.py,sha256=F2iiIL7UfhYdCVzdCa3efYqWbaeLKFrbycKnuPkG57M,650
86
86
  pixeltable/functions/video.py,sha256=qaPkeU4qO_g_lQhiMcytAOiJbwtfO89amGVxsT86MZQ,7180
87
87
  pixeltable/functions/vision.py,sha256=K_E1Q-n2plPuFoOPlbKWRMiJp9dPgftIJ2T_o3TNL3I,15594
88
88
  pixeltable/functions/whisper.py,sha256=VvGVWEsANHH2oCabT1bFTXoDEn5g90gQT_PCh56W4n4,3377
89
- pixeltable/globals.py,sha256=Jc9XgtCUTjM7O2EJLnrDXkjQL--ROSMlRl5MBP7ndC8,16661
89
+ pixeltable/globals.py,sha256=dbLCAuobQAJgjlTASp9bGRLwOYEyBntKLl3-GP7GTgU,16755
90
90
  pixeltable/index/__init__.py,sha256=XBwetNQQwnz0fiKwonOKhyy_U32l_cjt77kNvEIdjWs,102
91
91
  pixeltable/index/base.py,sha256=YAQ5Dz1mfI0dfu9rxWHWroE8TjB90yKfPtXAzoADq38,1568
92
92
  pixeltable/index/btree.py,sha256=NE4GYhcJWYJhdKyeHI0sQBlFvUaIgGOF9KLyCZOfFjE,1822
93
93
  pixeltable/index/embedding_index.py,sha256=U1wAjcTYvw3uJf3QHIOzBV8FLOUn8IeaFsLzUb_QTmc,7829
94
94
  pixeltable/io/__init__.py,sha256=bJGWPhKfgoMrSKFdXhLGupjQQbIXt7JaoUPwilND2PE,519
95
95
  pixeltable/io/external_store.py,sha256=iRqvMx9QuCKmOKBe12hoY1KfXyGvDHL-q1CjaZr3Fkk,16466
96
- pixeltable/io/globals.py,sha256=3kgeSFX6kFXGFMs9GpmLlU1sY9aZrfddMhVrQEOlJIs,13330
97
- pixeltable/io/hf_datasets.py,sha256=mVZYvjh7DVCmyKSQNnsJ8oLspSQw2WRnSm65F_pzf-A,8255
96
+ pixeltable/io/globals.py,sha256=ZmjbLy9EMhJjXKeNVgPhsi1dmllwJ1rsHu1XHadHtgM,13330
97
+ pixeltable/io/hf_datasets.py,sha256=E5E2yfaHo9Hf9gFI9ZhzaztHtRC_xDL6mIyeeond2Uo,8284
98
98
  pixeltable/io/label_studio.py,sha256=m1-ayI7S8Lxv2R1agrO-32xXyB8Z-YPP_ErAqu22c7o,31023
99
99
  pixeltable/io/pandas.py,sha256=7eHg7wnAfRA9eBk4iC0iSSVTKOM59Ne4pXokKWdt3dY,9793
100
100
  pixeltable/io/parquet.py,sha256=bUBJmnTFrlBZ8yIesqUJ1JufXZ76pm7vQ3Fq48hVijA,7853
101
101
  pixeltable/iterators/__init__.py,sha256=sjldFckkT8aVRiKgEP6faeAK2NQBdzbmpwAeRhI1FkM,366
102
102
  pixeltable/iterators/base.py,sha256=cnEh1tNN2JAxRzrLTg3dhun3N1oNQ8vifCm6ts3_UiE,1687
103
- pixeltable/iterators/document.py,sha256=dz7bfukocBS_2xBTD4gBS8EMaUP5fnkmQL-kWGxqdTA,19454
103
+ pixeltable/iterators/document.py,sha256=dAJjCRY0HUxrdMlGjf19ZLChARmWonYoJ0QvgfxkQyQ,19455
104
104
  pixeltable/iterators/string.py,sha256=NG_fWc_GAITDfzl6MvrDOMrSoMcZdMZf6hPQztCSatE,1305
105
105
  pixeltable/iterators/video.py,sha256=Glp7qNjyrH8X5S4WJOEsZhCa4yChalTICiR9bbMsHlo,5734
106
106
  pixeltable/metadata/__init__.py,sha256=8mYxCsc_uvN3tqwrmIbB9iBkQ9r9ybsdpFCMsrzNaNw,2172
@@ -119,7 +119,7 @@ pixeltable/metadata/notes.py,sha256=1Hk6TGy69a4jgqqLoaUlQPtzANMvMGkifKC5rjqeOeA,
119
119
  pixeltable/metadata/schema.py,sha256=H2NjpNBxZNDw_VV3UK97fKs30dh81uQf8F3vexKeePo,8567
120
120
  pixeltable/plan.py,sha256=pHTJxv2WzsDXtnBd9RvXtUnEFvIQjXV7NX_BIQRQiHs,38544
121
121
  pixeltable/store.py,sha256=zlVG9rs5k0k8wcfYF2jcgAQgIOfanJ9YjIDs_kacRIQ,21106
122
- pixeltable/tool/create_test_db_dump.py,sha256=YpVGwe6mn22p1sEGbXcEpOyF0RQwKgd3gCRA0Wl2phY,11980
122
+ pixeltable/tool/create_test_db_dump.py,sha256=y4LotPVbcQeqnarpISmVPWoURBVnjKjSl9Yi2MmCZE0,11980
123
123
  pixeltable/tool/create_test_video.py,sha256=OLfccymYReIpzE8osZn4rQvLXxxiPC_l0vc06U74hVM,2899
124
124
  pixeltable/tool/doc_plugins/griffe.py,sha256=Q6ARBlQNBm8J21G_p625TB5c8MQ8r6hJlm7I2LoBon0,3422
125
125
  pixeltable/tool/doc_plugins/mkdocstrings.py,sha256=afq7XOaSC5WRmugkh-FMFMK8PqOgIlDIsJdD8cuPhtE,207
@@ -131,7 +131,7 @@ pixeltable/utils/arrow.py,sha256=83_7aG5UR2qtTktw_otLkQs-RQbLk0VVM0JLJkbweNU,369
131
131
  pixeltable/utils/coco.py,sha256=ISpFBhR4eO1jOcg_SPb0thVI4KdS6H0RyNQauZIA5A4,7287
132
132
  pixeltable/utils/code.py,sha256=AOw1u2r8_DQXpX-lxJhyHWARGrCRDXOJHFVgKOi54Uc,1231
133
133
  pixeltable/utils/documents.py,sha256=UQq2F-W4stDuldFDSGHwUe5PK1dPoalN8SfYRoGqd14,2038
134
- pixeltable/utils/filecache.py,sha256=UoNONG2VaAc2IBB0e3sQdsvyOPOes2XSDc5_CsA4qek,7839
134
+ pixeltable/utils/filecache.py,sha256=hQOSz5VmC2MBk0F4RaZKgG1OQFFXOyFfanp-cQMOsU4,10553
135
135
  pixeltable/utils/formatter.py,sha256=XOuNAhZKCvA9Dlj1QYHB_ovwWUuznvvvdkWgjl4bWq0,9239
136
136
  pixeltable/utils/help.py,sha256=cCnxJ4VP9MJ57iDqExmnDcM-JG3a1lw_q7g-D7bpSVI,252
137
137
  pixeltable/utils/http_server.py,sha256=WQ5ILMzlz4TlwI9j5YqAPgEZyhrN1GytMNDbLD9occk,2422
@@ -140,8 +140,8 @@ pixeltable/utils/pytorch.py,sha256=VWczSB_FT_aOU5Xqv4T5ONTsnQN6KDlZmMkuoBuji08,3
140
140
  pixeltable/utils/s3.py,sha256=DBfXp0SYubhiKckdAD7PsiVBX_YfVP8Rcu6DCG_3SaQ,433
141
141
  pixeltable/utils/sql.py,sha256=5n5_OmXAGtqFdL6z5XvgnU-vlx6Ba6f1WJrO1ZwUle8,765
142
142
  pixeltable/utils/transactional_directory.py,sha256=UGzCrGtLR3hEEf8sYGuWBzLVFAEQml3vdIavigWeTBM,1349
143
- pixeltable-0.2.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
144
- pixeltable-0.2.19.dist-info/METADATA,sha256=elg8Pycf1lqUyJVyCbPHjWNXQqHqCKei_wRlKSI6A_o,13538
145
- pixeltable-0.2.19.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
146
- pixeltable-0.2.19.dist-info/entry_points.txt,sha256=TNI1Gb5vPwFrTdw6TimSYjO8FeK8c_HuPr28vcf7o_I,108
147
- pixeltable-0.2.19.dist-info/RECORD,,
143
+ pixeltable-0.2.20.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
144
+ pixeltable-0.2.20.dist-info/METADATA,sha256=QYhIK4U4RMLo_B3lTevJoPXFTM3hP8qfqO1A89R9Qjo,14972
145
+ pixeltable-0.2.20.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
146
+ pixeltable-0.2.20.dist-info/entry_points.txt,sha256=TNI1Gb5vPwFrTdw6TimSYjO8FeK8c_HuPr28vcf7o_I,108
147
+ pixeltable-0.2.20.dist-info/RECORD,,