pixeltable 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (42) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/insertable_table.py +9 -7
  4. pixeltable/catalog/table.py +18 -5
  5. pixeltable/catalog/table_version.py +1 -1
  6. pixeltable/catalog/view.py +1 -1
  7. pixeltable/dataframe.py +1 -1
  8. pixeltable/env.py +140 -40
  9. pixeltable/exceptions.py +12 -5
  10. pixeltable/exec/component_iteration_node.py +63 -42
  11. pixeltable/exprs/__init__.py +1 -2
  12. pixeltable/exprs/expr.py +5 -6
  13. pixeltable/exprs/function_call.py +8 -10
  14. pixeltable/exprs/inline_expr.py +200 -0
  15. pixeltable/exprs/json_path.py +3 -6
  16. pixeltable/ext/functions/whisperx.py +2 -0
  17. pixeltable/ext/functions/yolox.py +5 -3
  18. pixeltable/functions/huggingface.py +89 -12
  19. pixeltable/functions/image.py +3 -3
  20. pixeltable/functions/together.py +37 -16
  21. pixeltable/functions/vision.py +43 -21
  22. pixeltable/functions/whisper.py +3 -0
  23. pixeltable/globals.py +7 -1
  24. pixeltable/io/globals.py +1 -1
  25. pixeltable/io/hf_datasets.py +3 -3
  26. pixeltable/iterators/document.py +1 -1
  27. pixeltable/metadata/__init__.py +1 -1
  28. pixeltable/metadata/converters/convert_18.py +1 -1
  29. pixeltable/metadata/converters/convert_20.py +56 -0
  30. pixeltable/metadata/converters/util.py +29 -4
  31. pixeltable/metadata/notes.py +1 -0
  32. pixeltable/tool/create_test_db_dump.py +15 -4
  33. pixeltable/type_system.py +3 -1
  34. pixeltable/utils/filecache.py +126 -79
  35. pixeltable-0.2.20.dist-info/LICENSE +201 -0
  36. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/METADATA +16 -6
  37. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/RECORD +39 -39
  38. pixeltable/exprs/inline_array.py +0 -117
  39. pixeltable/exprs/inline_dict.py +0 -104
  40. pixeltable-0.2.18.dist-info/LICENSE +0 -18
  41. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/WHEEL +0 -0
  42. {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .exceptions import Error
4
4
  from .exprs import RELATIVE_PATH_ROOT
5
5
  from .func import Function, udf, Aggregator, uda, expr_udf
6
6
  from .globals import init, create_table, create_view, get_table, move, drop_table, list_tables, create_dir, drop_dir, \
7
- list_dirs, list_functions, configure_logging
7
+ list_dirs, list_functions, configure_logging, array
8
8
  from .type_system import (
9
9
  ColumnType,
10
10
  StringType,
pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = "0.2.18"
3
- __version_tuple__ = (0, 2, 18)
2
+ __version__ = "0.2.20"
3
+ __version_tuple__ = (0, 2, 20)
@@ -10,6 +10,7 @@ import pixeltable as pxt
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import exceptions as excs
12
12
  from pixeltable.env import Env
13
+ from pixeltable.utils.filecache import FileCache
13
14
 
14
15
  from .catalog import Catalog
15
16
  from .globals import UpdateStatus
@@ -101,21 +102,22 @@ class InsertableTable(Table):
101
102
  if not isinstance(row, dict):
102
103
  raise excs.Error('rows must be a list of dictionaries')
103
104
  self._validate_input_rows(rows)
104
- result = self._tbl_version.insert(rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception)
105
+ status = self._tbl_version.insert(rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception)
105
106
 
106
- if result.num_excs == 0:
107
+ if status.num_excs == 0:
107
108
  cols_with_excs_str = ''
108
109
  else:
109
110
  cols_with_excs_str = \
110
- f' across {len(result.cols_with_excs)} column{"" if len(result.cols_with_excs) == 1 else "s"}'
111
- cols_with_excs_str += f' ({", ".join(result.cols_with_excs)})'
111
+ f' across {len(status.cols_with_excs)} column{"" if len(status.cols_with_excs) == 1 else "s"}'
112
+ cols_with_excs_str += f' ({", ".join(status.cols_with_excs)})'
112
113
  msg = (
113
- f'Inserted {result.num_rows} row{"" if result.num_rows == 1 else "s"} '
114
- f'with {result.num_excs} error{"" if result.num_excs == 1 else "s"}{cols_with_excs_str}.'
114
+ f'Inserted {status.num_rows} row{"" if status.num_rows == 1 else "s"} '
115
+ f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}{cols_with_excs_str}.'
115
116
  )
116
117
  print(msg)
117
118
  _logger.info(f'InsertableTable {self._name}: {msg}')
118
- return result
119
+ FileCache.get().emit_eviction_warnings()
120
+ return status
119
121
 
120
122
  def _validate_input_rows(self, rows: List[Dict[str, Any]]) -> None:
121
123
  """Verify that the input rows match the table schema"""
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  import abc
4
4
  import builtins
5
- import itertools
6
5
  import json
7
6
  import logging
8
7
  from pathlib import Path
@@ -21,6 +20,7 @@ import pixeltable.exprs as exprs
21
20
  import pixeltable.index as index
22
21
  import pixeltable.metadata.schema as schema
23
22
  import pixeltable.type_system as ts
23
+ from pixeltable.utils.filecache import FileCache
24
24
 
25
25
  from .column import Column
26
26
  from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier
@@ -34,7 +34,12 @@ if TYPE_CHECKING:
34
34
  _logger = logging.getLogger('pixeltable')
35
35
 
36
36
  class Table(SchemaObject):
37
- """Base class for table objects (base tables, views, snapshots)."""
37
+ """
38
+ Base class for table objects (base tables, views, snapshots).
39
+
40
+ Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
41
+ FileCache.emit_eviction_warnings() at the end of the operation.
42
+ """
38
43
 
39
44
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
40
45
  super().__init__(id, name, dir_id)
@@ -375,7 +380,10 @@ class Table(SchemaObject):
375
380
 
376
381
  new_col = self._create_columns({col_name: col_schema})[0]
377
382
  self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
378
- return self._tbl_version.add_column(new_col, print_stats=print_stats)
383
+ status = self._tbl_version.add_column(new_col, print_stats=print_stats)
384
+ FileCache.get().emit_eviction_warnings()
385
+ return status
386
+
379
387
 
380
388
  @classmethod
381
389
  def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
@@ -588,6 +596,7 @@ class Table(SchemaObject):
588
596
  idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
589
597
  status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
590
598
  # TODO: how to deal with exceptions here? drop the index and raise?
599
+ FileCache.get().emit_eviction_warnings()
591
600
 
592
601
  def drop_embedding_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
593
602
  """Drop an embedding index from the table.
@@ -733,7 +742,9 @@ class Table(SchemaObject):
733
742
  >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
734
743
  """
735
744
  self._check_is_dropped()
736
- return self._tbl_version.update(value_spec, where, cascade)
745
+ status = self._tbl_version.update(value_spec, where, cascade)
746
+ FileCache.get().emit_eviction_warnings()
747
+ return status
737
748
 
738
749
  def batch_update(
739
750
  self, rows: Iterable[dict[str, Any]], cascade: bool = True,
@@ -790,9 +801,11 @@ class Table(SchemaObject):
790
801
  missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
791
802
  raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
792
803
  row_updates.append(col_vals)
793
- return self._tbl_version.batch_update(
804
+ status = self._tbl_version.batch_update(
794
805
  row_updates, rowids, error_if_not_exists=if_not_exists == 'error',
795
806
  insert_if_not_exists=if_not_exists == 'insert', cascade=cascade)
807
+ FileCache.get().emit_eviction_warnings()
808
+ return status
796
809
 
797
810
  def delete(self, where: Optional['pixeltable.exprs.Expr'] = None) -> UpdateStatus:
798
811
  """Delete rows in this table.
@@ -147,7 +147,7 @@ class TableVersion:
147
147
  module = importlib.import_module(module_name)
148
148
  self.iterator_cls = getattr(module, class_name)
149
149
  self.iterator_args = exprs.InlineDict.from_dict(tbl_md.view_md.iterator_args)
150
- output_schema, _ = self.iterator_cls.output_schema(**self.iterator_args.to_dict())
150
+ output_schema, _ = self.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
151
151
  self.num_iterator_cols = len(output_schema)
152
152
  assert tbl_md.view_md.iterator_args is not None
153
153
 
@@ -94,7 +94,7 @@ class View(Table):
94
94
  ]
95
95
  sig = func.Signature(InvalidType(), params)
96
96
  from pixeltable.exprs import FunctionCall
97
- FunctionCall.normalize_args(sig, bound_args)
97
+ FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
98
98
  except TypeError as e:
99
99
  raise Error(f'Cannot instantiate iterator with given arguments: {e}')
100
100
 
pixeltable/dataframe.py CHANGED
@@ -501,7 +501,7 @@ class DataFrame:
501
501
  elif isinstance(raw_expr, dict):
502
502
  select_list.append((exprs.InlineDict(raw_expr), name))
503
503
  elif isinstance(raw_expr, list):
504
- select_list.append((exprs.InlineArray(raw_expr), name))
504
+ select_list.append((exprs.InlineList(raw_expr), name))
505
505
  else:
506
506
  select_list.append((exprs.Literal(raw_expr), name))
507
507
  expr = select_list[-1][0]
pixeltable/env.py CHANGED
@@ -8,18 +8,20 @@ import importlib.util
8
8
  import inspect
9
9
  import logging
10
10
  import os
11
+ import shutil
12
+ import subprocess
11
13
  import sys
12
14
  import threading
13
15
  import uuid
14
16
  import warnings
15
17
  from dataclasses import dataclass
16
18
  from pathlib import Path
17
- from typing import TYPE_CHECKING, Any, Callable, Optional
19
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
18
20
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
19
21
 
20
22
  import pixeltable_pgserver
21
23
  import sqlalchemy as sql
22
- import yaml
24
+ import toml
23
25
  from tqdm import TqdmWarning
24
26
 
25
27
  import pixeltable.exceptions as excs
@@ -63,7 +65,7 @@ class Env:
63
65
  _log_to_stdout: bool
64
66
  _module_log_level: dict[str, int] # module name -> log level
65
67
  _config_file: Optional[Path]
66
- _config: Optional[dict[str, Any]]
68
+ _config: Optional[Config]
67
69
  _stdout_handler: logging.StreamHandler
68
70
  _initialized: bool
69
71
 
@@ -109,6 +111,7 @@ class Env:
109
111
  self._log_to_stdout = False
110
112
  self._module_log_level = {} # module name -> log level
111
113
 
114
+ # config
112
115
  self._config_file = None
113
116
  self._config = None
114
117
 
@@ -118,7 +121,8 @@ class Env:
118
121
  self._initialized = False
119
122
 
120
123
  @property
121
- def config(self):
124
+ def config(self) -> Config:
125
+ assert self._config is not None
122
126
  return self._config
123
127
 
124
128
  @property
@@ -226,30 +230,13 @@ class Env:
226
230
  home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
227
231
  assert self._home is None or self._home == home
228
232
  self._home = home
229
- self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.yaml')))
233
+ self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.toml')))
230
234
  self._media_dir = self._home / 'media'
231
235
  self._file_cache_dir = self._home / 'file_cache'
232
236
  self._dataset_cache_dir = self._home / 'dataset_cache'
233
237
  self._log_dir = self._home / 'logs'
234
238
  self._tmp_dir = self._home / 'tmp'
235
239
 
236
- # Read in the config
237
- if os.path.isfile(self._config_file):
238
- with open(self._config_file, 'r') as stream:
239
- try:
240
- self._config = yaml.safe_load(stream)
241
- except yaml.YAMLError as exc:
242
- self._logger.error(f'Could not read config file: {self._config_file}')
243
- self._config = {}
244
- else:
245
- self._config = {}
246
-
247
- # Disable spurious warnings
248
- warnings.simplefilter('ignore', category=TqdmWarning)
249
- if 'hide_warnings' in self._config and self._config['hide_warnings']:
250
- # Disable more warnings
251
- warnings.simplefilter('ignore', category=UserWarning)
252
-
253
240
  if self._home.exists() and not self._home.is_dir():
254
241
  raise RuntimeError(f'{self._home} is not a directory')
255
242
 
@@ -273,6 +260,22 @@ class Env:
273
260
  if not self._tmp_dir.exists():
274
261
  self._tmp_dir.mkdir()
275
262
 
263
+ # Read in the config
264
+ self._config = Config.from_file(self._config_file)
265
+ self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
266
+ if self._file_cache_size_g is None:
267
+ raise excs.Error(
268
+ 'pixeltable/file_cache_size_g is missing from configuration\n'
269
+ f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {self._config_file},\n'
270
+ 'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
271
+ )
272
+
273
+ # Disable spurious warnings
274
+ warnings.simplefilter('ignore', category=TqdmWarning)
275
+ if self._config.get_bool_value('hide_warnings'):
276
+ # Disable more warnings
277
+ warnings.simplefilter('ignore', category=UserWarning)
278
+
276
279
  # configure _logger to log to a file
277
280
  self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
278
281
  fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
@@ -312,7 +315,7 @@ class Env:
312
315
  self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
313
316
  self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
314
317
 
315
- tz_name = os.environ.get('PXT_TIME_ZONE', self._config.get('pxt_time_zone', None))
318
+ tz_name = self.config.get_string_value('time_zone')
316
319
  if tz_name is not None:
317
320
  # Validate tzname
318
321
  if not isinstance(tz_name, str):
@@ -439,21 +442,18 @@ class Env:
439
442
  if cl.client_obj is not None:
440
443
  return cl.client_obj # Already initialized
441
444
 
442
- # Construct a client. For each client parameter, first check if the parameter is in the environment;
443
- # if not, look in Pixeltable config from `config.yaml`.
445
+ # Construct a client, retrieving each parameter from config.
444
446
 
445
447
  init_kwargs: dict[str, str] = {}
446
448
  for param in cl.param_names:
447
- environ = f'{name.upper()}_{param.upper()}'
448
- if environ in os.environ:
449
- init_kwargs[param] = os.environ[environ]
450
- elif name.lower() in self._config and param in self._config[name.lower()]:
451
- init_kwargs[param] = self._config[name.lower()][param.lower()]
452
- if param not in init_kwargs or init_kwargs[param] == '':
449
+ arg = self._config.get_string_value(param, section=name)
450
+ if arg is not None and len(arg) > 0:
451
+ init_kwargs[param] = arg
452
+ else:
453
453
  raise excs.Error(
454
454
  f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
455
- f'To fix this, specify the `{environ}` environment variable, or put `{param.lower()}` in '
456
- f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.yaml.'
455
+ f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, or put `{param.lower()}` in '
456
+ f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
457
457
  )
458
458
 
459
459
  cl.client_obj = cl.init_fn(**init_kwargs)
@@ -486,6 +486,8 @@ class Env:
486
486
  """Check for and start runtime services"""
487
487
  self._start_web_server()
488
488
  self.__register_packages()
489
+ if self.is_installed_package('spacy'):
490
+ self.__init_spacy()
489
491
 
490
492
  def __register_packages(self) -> None:
491
493
  """Declare optional packages that are utilized by some parts of the code."""
@@ -500,10 +502,9 @@ class Env:
500
502
  self.__register_package('openpyxl')
501
503
  self.__register_package('pyarrow')
502
504
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
503
- self.__register_package('spacy') # TODO: deal with en-core-web-sm
505
+ self.__register_package('spacy')
504
506
  self.__register_package('tiktoken')
505
507
  self.__register_package('together')
506
- self.__register_package('toml')
507
508
  self.__register_package('torch')
508
509
  self.__register_package('torchvision')
509
510
  self.__register_package('transformers')
@@ -511,10 +512,6 @@ class Env:
511
512
  self.__register_package('whisperx')
512
513
  self.__register_package('yolox', library_name='git+https://github.com/Megvii-BaseDetection/YOLOX@ac58e0a')
513
514
 
514
- if self.is_installed_package('spacy'):
515
- import spacy
516
- self._spacy_nlp = spacy.load('en_core_web_sm')
517
-
518
515
  def __register_package(self, package_name: str, library_name: Optional[str] = None) -> None:
519
516
  self.__optional_packages[package_name] = PackageInfo(
520
517
  is_installed=importlib.util.find_spec(package_name) is not None,
@@ -556,6 +553,35 @@ class Env:
556
553
  f'To fix this, run: `pip install -U {package_info.library_name}`'
557
554
  )
558
555
 
556
+ def __init_spacy(self) -> None:
557
+ """
558
+ spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
559
+ dependency, we install it programmatically here. This should cause no problems, since the model packages
560
+ have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
561
+ """
562
+ import spacy
563
+ from spacy.cli.download import get_model_filename
564
+ spacy_model = 'en_core_web_sm'
565
+ spacy_model_version = '3.7.1'
566
+ filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
567
+ url = f'{spacy.about.__download_url__}/{filename}'
568
+ # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
569
+ # a problem, because the model have been installed on a previous attempt.
570
+ self._logger.info(f'Ensuring spaCy model is installed: {filename}')
571
+ ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
572
+ if ret.returncode != 0:
573
+ self._logger.warn(f'pip install failed for spaCy model: {filename}')
574
+ try:
575
+ self._logger.info(f'Loading spaCy model: {spacy_model}')
576
+ self._spacy_nlp = spacy.load(spacy_model)
577
+ except Exception as exc:
578
+ self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
579
+ warnings.warn(
580
+ f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
581
+ excs.PixeltableWarning
582
+ )
583
+ self.__optional_packages['spacy'].is_installed = False
584
+
559
585
  def num_tmp_files(self) -> int:
560
586
  return len(glob.glob(f'{self._tmp_dir}/*'))
561
587
 
@@ -594,6 +620,7 @@ class Env:
594
620
 
595
621
  @property
596
622
  def spacy_nlp(self) -> spacy.Language:
623
+ Env.get().require_package('spacy')
597
624
  assert self._spacy_nlp is not None
598
625
  return self._spacy_nlp
599
626
 
@@ -614,7 +641,7 @@ def register_client(name: str) -> Callable:
614
641
  Pixeltable will attempt to load the client parameters from config. For each
615
642
  config parameter:
616
643
  - If an environment variable named MY_CLIENT_API_KEY (for example) is set, use it;
617
- - Otherwise, look for 'api_key' in the 'my_client' section of config.yaml.
644
+ - Otherwise, look for 'api_key' in the 'my_client' section of config.toml.
618
645
 
619
646
  If all config parameters are found, Pixeltable calls the initialization function;
620
647
  otherwise it throws an exception.
@@ -631,6 +658,79 @@ def register_client(name: str) -> Callable:
631
658
  return decorator
632
659
 
633
660
 
661
class Config:
    """
    The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
    configuration values, which can be set in the config file or as environment variables.

    For every lookup, the environment variable `<SECTION>_<KEY>` (upper-cased) takes precedence over the
    `key` entry in the `section` table of the config file.
    """
    __config: dict[str, Any]

    T = TypeVar('T')

    # Spellings accepted for boolean values that arrive as strings (compared case-insensitively).
    # The empty string maps to False, which matches the truthiness of an empty string.
    __TRUE_STRINGS = frozenset({'1', 'true', 'yes', 'on'})
    __FALSE_STRINGS = frozenset({'', '0', 'false', 'no', 'off'})

    @classmethod
    def from_file(cls, path: Path) -> 'Config':
        """
        Loads configuration from the specified TOML file. If the file does not exist, it will be
        created and populated with the default configuration.

        Raises:
            excs.Error: if the file cannot be parsed, or the default configuration cannot be serialized.
        """
        if os.path.isfile(path):
            # TOML documents are UTF-8 by specification, so don't depend on the locale's encoding
            with open(path, 'r', encoding='utf-8') as stream:
                try:
                    config_dict = toml.load(stream)
                except Exception as exc:
                    raise excs.Error(f'Could not read config file: {str(path)}') from exc
        else:
            config_dict = cls.__create_default_config(path)
            with open(path, 'w', encoding='utf-8') as stream:
                try:
                    toml.dump(config_dict, stream)
                except Exception as exc:
                    raise excs.Error(f'Could not write config file: {str(path)}') from exc
            logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
        return cls(config_dict)

    @classmethod
    def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
        """Returns the default configuration; the cache size is derived from the free space next to `config_path`."""
        free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
        # Default cache size is 1/5 of free disk space
        file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
        return {
            'pixeltable': {
                'file_cache_size_g': round(file_cache_size_g, 1),
                'hide_warnings': False,
            }
        }

    def __init__(self, config: dict[str, Any]) -> None:
        self.__config = config

    @classmethod
    def __parse_bool(cls, value: str) -> bool:
        """Converts a textual boolean to `bool`; raises ValueError for unrecognized spellings."""
        normalized = value.strip().lower()
        if normalized in cls.__TRUE_STRINGS:
            return True
        if normalized in cls.__FALSE_STRINGS:
            return False
        raise ValueError(f'not a boolean: {value!r}')

    def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
        """
        Returns the value of `key` in `section`, converted to `expected_type`, or None if it is not configured.

        Raises:
            excs.Error: if the configured value cannot be converted to `expected_type`.
        """
        env_var = f'{section.upper()}_{key.upper()}'
        section_values = self.__config.get(section)
        if env_var in os.environ:
            value = os.environ[env_var]
        elif isinstance(section_values, dict) and key in section_values:
            value = section_values[key]
        else:
            return None

        try:
            if expected_type is bool and isinstance(value, str):
                # bool('false') is True, so textual booleans (e.g., from environment variables)
                # need to be parsed explicitly
                return self.__parse_bool(value)  # type: ignore[return-value]
            return expected_type(value)  # type: ignore[call-arg]
        except (TypeError, ValueError) as exc:
            raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc

    def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
        return self.get_value(key, str, section)

    def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
        return self.get_value(key, int, section)

    def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
        return self.get_value(key, float, section)

    def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
        return self.get_value(key, bool, section)
732
+
733
+
634
734
  _registered_clients: dict[str, ApiClient] = {}
635
735
 
636
736
 
pixeltable/exceptions.py CHANGED
@@ -1,6 +1,9 @@
1
- from typing import List, Any
2
- from types import TracebackType
3
1
  from dataclasses import dataclass
2
+ from types import TracebackType
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ if TYPE_CHECKING:
6
+ from pixeltable import exprs
4
7
 
5
8
 
6
9
  class Error(Exception):
@@ -9,9 +12,13 @@ class Error(Exception):
9
12
 
10
13
  @dataclass
11
14
  class ExprEvalError(Exception):
12
- expr: Any # exprs.Expr, but we're not importing pixeltable.exprs to avoid circular imports
15
+ expr: 'exprs.Expr'
13
16
  expr_msg: str
14
17
  exc: Exception
15
18
  exc_tb: TracebackType
16
- input_vals: List[Any]
17
- row_num: int
19
+ input_vals: list[Any]
20
+ row_num: int
21
+
22
+
23
class PixeltableWarning(Warning):
    """Warning category for warnings that are issued by Pixeltable via `warnings.warn()`."""
@@ -1,10 +1,12 @@
1
- from typing import Generator, Optional
1
+ import inspect
2
+ from typing import Iterator, Optional
2
3
 
3
- from .data_row_batch import DataRowBatch
4
- from .exec_node import ExecNode
5
4
  import pixeltable.catalog as catalog
6
- import pixeltable.exprs as exprs
7
5
  import pixeltable.exceptions as excs
6
+ import pixeltable.exprs as exprs
7
+
8
+ from .data_row_batch import DataRowBatch
9
+ from .exec_node import ExecNode
8
10
 
9
11
 
10
12
  class ComponentIterationNode(ExecNode):
@@ -12,7 +14,7 @@ class ComponentIterationNode(ExecNode):
12
14
 
13
15
  Returns row batches of OUTPUT_BATCH_SIZE size.
14
16
  """
15
- OUTPUT_BATCH_SIZE = 1024
17
+ __OUTPUT_BATCH_SIZE = 1024
16
18
 
17
19
  def __init__(self, view: catalog.TableVersion, input: ExecNode):
18
20
  assert view.is_component_view()
@@ -23,57 +25,76 @@ class ComponentIterationNode(ExecNode):
23
25
  self.iterator_args = iterator_args[0]
24
26
  assert isinstance(self.iterator_args, exprs.InlineDict)
25
27
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
26
- self.iterator_output_schema, self.unstored_column_names = \
27
- self.view.iterator_cls.output_schema(**self.iterator_args.to_dict())
28
+ self.iterator_output_schema, self.unstored_column_names = (
29
+ self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
30
+ )
28
31
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
29
- self.iterator_output_cols = \
30
- {field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields}
32
+ self.iterator_output_cols = {
33
+ field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
34
+ }
31
35
  # referenced iterator output fields
32
36
  self.refd_output_slot_idxs = {
33
37
  e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
34
38
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
35
39
  }
36
- self._output: Optional[Generator[DataRowBatch, None, None]] = None
40
+ self.__output: Optional[Iterator[DataRowBatch]] = None
37
41
 
38
- def _output_batches(self) -> Generator[DataRowBatch, None, None]:
42
+ def __output_batches(self) -> Iterator[DataRowBatch]:
39
43
  output_batch = DataRowBatch(self.view, self.row_builder)
40
44
  for input_batch in self.input:
41
45
  for input_row in input_batch:
42
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
43
47
  iterator_args = input_row[self.iterator_args.slot_idx]
44
- iterator = self.view.iterator_cls(**iterator_args)
45
- for pos, component_dict in enumerate(iterator):
46
- output_row = output_batch.add_row()
47
- input_row.copy(output_row)
48
- # we're expanding the input and need to add the iterator position to the pk
49
- pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
50
- output_row.set_pk(pk)
51
-
52
- # verify and copy component_dict fields to their respective slots in output_row
53
- for field_name, field_val in component_dict.items():
54
- if field_name not in self.iterator_output_fields:
55
- raise excs.Error(
56
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
57
- if field_name not in self.refd_output_slot_idxs:
58
- # we can ignore this
59
- continue
60
- output_col = self.iterator_output_cols[field_name]
61
- output_col.col_type.validate_literal(field_val)
62
- output_row[self.refd_output_slot_idxs[field_name]] = field_val
63
- if len(component_dict) != len(self.iterator_output_fields):
64
- missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
65
- raise excs.Error(
66
- f'Invalid output of {self.view.iterator_cls.__name__}: '
67
- f'missing fields {", ".join(missing_fields)}')
68
-
69
- if len(output_batch) == self.OUTPUT_BATCH_SIZE:
70
- yield output_batch
71
- output_batch = DataRowBatch(self.view, self.row_builder)
48
+ assert isinstance(iterator_args, dict)
49
+ # We need to ensure that all of the required (non-nullable) parameters of the iterator are
50
+ # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
51
+ # output rows for this input row).
52
+ if self.__non_nullable_args_specified(iterator_args):
53
+ iterator = self.view.iterator_cls(**iterator_args)
54
+ for pos, component_dict in enumerate(iterator):
55
+ output_row = output_batch.add_row()
56
+ input_row.copy(output_row)
57
+ # we're expanding the input and need to add the iterator position to the pk
58
+ self.__populate_output_row(output_row, pos, component_dict)
59
+ if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
+ yield output_batch
61
+ output_batch = DataRowBatch(self.view, self.row_builder)
72
62
 
73
63
  if len(output_batch) > 0:
74
64
  yield output_batch
75
65
 
66
+ def __non_nullable_args_specified(self, iterator_args: dict) -> bool:
67
+ """
68
+ Returns true if all non-nullable iterator arguments are not `None`.
69
+ """
70
+ input_schema = self.view.iterator_cls.input_schema()
71
+ for arg_name, arg_value in iterator_args.items():
72
+ col_type = input_schema[arg_name]
73
+ if arg_value is None and not col_type.nullable:
74
+ return False
75
+ return True
76
+
77
+ def __populate_output_row(self, output_row: exprs.DataRow, pos: int, component_dict: dict) -> None:
78
+ pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
79
+ output_row.set_pk(pk)
80
+ # verify and copy component_dict fields to their respective slots in output_row
81
+ for field_name, field_val in component_dict.items():
82
+ if field_name not in self.iterator_output_fields:
83
+ raise excs.Error(
84
+ f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
85
+ if field_name not in self.refd_output_slot_idxs:
86
+ # we can ignore this
87
+ continue
88
+ output_col = self.iterator_output_cols[field_name]
89
+ output_col.col_type.validate_literal(field_val)
90
+ output_row[self.refd_output_slot_idxs[field_name]] = field_val
91
+ if len(component_dict) != len(self.iterator_output_fields):
92
+ missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
93
+ raise excs.Error(
94
+ f'Invalid output of {self.view.iterator_cls.__name__}: '
95
+ f'missing fields {", ".join(missing_fields)}')
96
+
76
97
  def __next__(self) -> DataRowBatch:
77
- if self._output is None:
78
- self._output = self._output_batches()
79
- return next(self._output)
98
+ if self.__output is None:
99
+ self.__output = self.__output_batches()
100
+ return next(self.__output)
@@ -9,8 +9,7 @@ from .expr import Expr
9
9
  from .expr_set import ExprSet
10
10
  from .function_call import FunctionCall
11
11
  from .in_predicate import InPredicate
12
- from .inline_array import InlineArray
13
- from .inline_dict import InlineDict
12
+ from .inline_expr import InlineArray, InlineDict, InlineList
14
13
  from .is_null import IsNull
15
14
  from .json_mapper import JsonMapper
16
15
  from .json_path import RELATIVE_PATH_ROOT, JsonPath
pixeltable/exprs/expr.py CHANGED
@@ -356,15 +356,14 @@ class Expr(abc.ABC):
356
356
  """
357
357
  if isinstance(o, Expr):
358
358
  return o
359
- # Try to create a literal. We need to check for InlineArray/InlineDict
360
- # first, to prevent arrays from inappropriately being interpreted as JsonType
359
+ # Try to create a literal. We need to check for InlineList/InlineDict
360
+ # first, to prevent them from inappropriately being interpreted as JsonType
361
361
  # literals.
362
- # TODO: general cleanup of InlineArray/InlineDict
363
362
  if isinstance(o, list):
364
- from .inline_array import InlineArray
365
- return InlineArray(tuple(o))
363
+ from .inline_expr import InlineList
364
+ return InlineList(o)
366
365
  if isinstance(o, dict):
367
- from .inline_dict import InlineDict
366
+ from .inline_expr import InlineDict
368
367
  return InlineDict(o)
369
368
  obj_type = ts.ColumnType.infer_literal_type(o)
370
369
  if obj_type is not None: