pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (127)
  1. pixeltable/__init__.py +5 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -0
  4. pixeltable/catalog/catalog.py +335 -128
  5. pixeltable/catalog/column.py +22 -5
  6. pixeltable/catalog/dir.py +19 -6
  7. pixeltable/catalog/insertable_table.py +34 -37
  8. pixeltable/catalog/named_function.py +0 -4
  9. pixeltable/catalog/schema_object.py +28 -42
  10. pixeltable/catalog/table.py +193 -158
  11. pixeltable/catalog/table_version.py +191 -232
  12. pixeltable/catalog/table_version_handle.py +50 -0
  13. pixeltable/catalog/table_version_path.py +49 -33
  14. pixeltable/catalog/view.py +56 -96
  15. pixeltable/config.py +103 -0
  16. pixeltable/dataframe.py +89 -89
  17. pixeltable/env.py +98 -168
  18. pixeltable/exec/aggregation_node.py +5 -4
  19. pixeltable/exec/cache_prefetch_node.py +1 -1
  20. pixeltable/exec/component_iteration_node.py +13 -9
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +0 -4
  23. pixeltable/exec/exec_node.py +3 -2
  24. pixeltable/exec/expr_eval/schedulers.py +2 -1
  25. pixeltable/exec/in_memory_data_node.py +9 -4
  26. pixeltable/exec/row_update_node.py +1 -2
  27. pixeltable/exec/sql_node.py +20 -16
  28. pixeltable/exprs/__init__.py +2 -0
  29. pixeltable/exprs/arithmetic_expr.py +7 -11
  30. pixeltable/exprs/array_slice.py +1 -1
  31. pixeltable/exprs/column_property_ref.py +3 -3
  32. pixeltable/exprs/column_ref.py +12 -13
  33. pixeltable/exprs/comparison.py +3 -6
  34. pixeltable/exprs/compound_predicate.py +4 -4
  35. pixeltable/exprs/expr.py +31 -22
  36. pixeltable/exprs/expr_dict.py +3 -3
  37. pixeltable/exprs/expr_set.py +1 -1
  38. pixeltable/exprs/function_call.py +110 -80
  39. pixeltable/exprs/globals.py +3 -3
  40. pixeltable/exprs/in_predicate.py +1 -1
  41. pixeltable/exprs/inline_expr.py +3 -3
  42. pixeltable/exprs/is_null.py +1 -1
  43. pixeltable/exprs/json_mapper.py +2 -2
  44. pixeltable/exprs/json_path.py +17 -10
  45. pixeltable/exprs/literal.py +1 -1
  46. pixeltable/exprs/method_ref.py +2 -2
  47. pixeltable/exprs/row_builder.py +8 -17
  48. pixeltable/exprs/rowid_ref.py +21 -10
  49. pixeltable/exprs/similarity_expr.py +5 -5
  50. pixeltable/exprs/sql_element_cache.py +1 -1
  51. pixeltable/exprs/type_cast.py +2 -3
  52. pixeltable/exprs/variable.py +2 -2
  53. pixeltable/ext/__init__.py +2 -0
  54. pixeltable/ext/functions/__init__.py +2 -0
  55. pixeltable/ext/functions/yolox.py +3 -3
  56. pixeltable/func/__init__.py +3 -1
  57. pixeltable/func/aggregate_function.py +9 -9
  58. pixeltable/func/callable_function.py +3 -4
  59. pixeltable/func/expr_template_function.py +6 -16
  60. pixeltable/func/function.py +48 -14
  61. pixeltable/func/function_registry.py +1 -3
  62. pixeltable/func/query_template_function.py +5 -12
  63. pixeltable/func/signature.py +23 -22
  64. pixeltable/func/tools.py +3 -3
  65. pixeltable/func/udf.py +6 -4
  66. pixeltable/functions/__init__.py +2 -0
  67. pixeltable/functions/fireworks.py +7 -4
  68. pixeltable/functions/globals.py +4 -5
  69. pixeltable/functions/huggingface.py +1 -5
  70. pixeltable/functions/image.py +17 -7
  71. pixeltable/functions/llama_cpp.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +4 -4
  74. pixeltable/functions/openai.py +19 -19
  75. pixeltable/functions/string.py +23 -30
  76. pixeltable/functions/timestamp.py +11 -6
  77. pixeltable/functions/together.py +14 -12
  78. pixeltable/functions/util.py +1 -1
  79. pixeltable/functions/video.py +5 -4
  80. pixeltable/functions/vision.py +6 -9
  81. pixeltable/functions/whisper.py +3 -3
  82. pixeltable/globals.py +246 -260
  83. pixeltable/index/__init__.py +2 -0
  84. pixeltable/index/base.py +1 -1
  85. pixeltable/index/btree.py +3 -1
  86. pixeltable/index/embedding_index.py +11 -5
  87. pixeltable/io/external_store.py +11 -12
  88. pixeltable/io/label_studio.py +4 -3
  89. pixeltable/io/parquet.py +57 -56
  90. pixeltable/iterators/__init__.py +4 -2
  91. pixeltable/iterators/audio.py +11 -11
  92. pixeltable/iterators/document.py +10 -10
  93. pixeltable/iterators/string.py +1 -2
  94. pixeltable/iterators/video.py +14 -15
  95. pixeltable/metadata/__init__.py +9 -5
  96. pixeltable/metadata/converters/convert_10.py +0 -1
  97. pixeltable/metadata/converters/convert_15.py +0 -2
  98. pixeltable/metadata/converters/convert_23.py +0 -2
  99. pixeltable/metadata/converters/convert_24.py +3 -3
  100. pixeltable/metadata/converters/convert_25.py +1 -1
  101. pixeltable/metadata/converters/convert_27.py +0 -2
  102. pixeltable/metadata/converters/convert_28.py +0 -2
  103. pixeltable/metadata/converters/convert_29.py +7 -8
  104. pixeltable/metadata/converters/util.py +7 -7
  105. pixeltable/metadata/schema.py +27 -19
  106. pixeltable/plan.py +68 -40
  107. pixeltable/share/__init__.py +2 -0
  108. pixeltable/share/packager.py +15 -12
  109. pixeltable/share/publish.py +3 -5
  110. pixeltable/store.py +37 -38
  111. pixeltable/type_system.py +41 -28
  112. pixeltable/utils/coco.py +4 -4
  113. pixeltable/utils/console_output.py +1 -3
  114. pixeltable/utils/description_helper.py +1 -1
  115. pixeltable/utils/documents.py +3 -3
  116. pixeltable/utils/filecache.py +20 -9
  117. pixeltable/utils/formatter.py +2 -3
  118. pixeltable/utils/media_store.py +1 -1
  119. pixeltable/utils/pytorch.py +1 -1
  120. pixeltable/utils/sql.py +4 -4
  121. pixeltable/utils/transactional_directory.py +2 -1
  122. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/METADATA +1 -1
  123. pixeltable-0.3.8.dist-info/RECORD +174 -0
  124. pixeltable-0.3.6.dist-info/RECORD +0 -172
  125. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/LICENSE +0 -0
  126. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/WHEEL +0 -0
  127. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -16,19 +16,19 @@ import threading
16
16
  import uuid
17
17
  import warnings
18
18
  from abc import abstractmethod
19
+ from contextlib import contextmanager
19
20
  from dataclasses import dataclass, field
20
21
  from pathlib import Path
21
22
  from sys import stdout
22
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
23
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
23
24
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
24
25
 
25
26
  import pixeltable_pgserver
26
27
  import sqlalchemy as sql
27
- import toml
28
28
  from tqdm import TqdmWarning
29
29
 
30
- import pixeltable.exceptions as excs
31
- from pixeltable import metadata
30
+ from pixeltable import exceptions as excs
31
+ from pixeltable.config import Config
32
32
  from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
33
33
  from pixeltable.utils.http_server import make_server
34
34
 
@@ -47,9 +47,9 @@ class Env:
47
47
  """
48
48
 
49
49
  _instance: Optional[Env] = None
50
+ __initializing: bool = False
50
51
  _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
51
52
 
52
- _home: Optional[Path]
53
53
  _media_dir: Optional[Path]
54
54
  _file_cache_dir: Optional[Path] # cached media files with external URL
55
55
  _dataset_cache_dir: Optional[Path] # cached datasets (eg, pytorch or COCO)
@@ -69,19 +69,18 @@ class Env:
69
69
  _httpd: Optional[http.server.HTTPServer]
70
70
  _http_address: Optional[str]
71
71
  _logger: logging.Logger
72
- _console_logger: ConsoleLogger
73
72
  _default_log_level: int
74
73
  _logfilename: Optional[str]
75
74
  _log_to_stdout: bool
76
75
  _module_log_level: dict[str, int] # module name -> log level
77
- _config_file: Optional[Path]
78
- _config: Optional[Config]
79
76
  _file_cache_size_g: float
80
77
  _pxt_api_key: Optional[str]
81
78
  _stdout_handler: logging.StreamHandler
82
79
  _initialized: bool
83
80
 
84
81
  _resource_pool_info: dict[str, Any]
82
+ _current_conn: Optional[sql.Connection]
83
+ _current_session: Optional[sql.orm.Session]
85
84
 
86
85
  @classmethod
87
86
  def get(cls) -> Env:
@@ -91,15 +90,17 @@ class Env:
91
90
 
92
91
  @classmethod
93
92
  def _init_env(cls, reinit_db: bool = False) -> None:
93
+ assert not cls.__initializing, 'Circular env initialization detected.'
94
+ cls.__initializing = True
94
95
  env = Env()
95
96
  env._set_up(reinit_db=reinit_db)
96
97
  env._upgrade_metadata()
97
98
  cls._instance = env
99
+ cls.__initializing = False
98
100
 
99
101
  def __init__(self):
100
102
  assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
101
103
 
102
- self._home = None
103
104
  self._media_dir = None # computed media files
104
105
  self._file_cache_dir = None # cached media files with external URL
105
106
  self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
@@ -127,21 +128,14 @@ class Env:
127
128
  self._log_to_stdout = False
128
129
  self._module_log_level = {} # module name -> log level
129
130
 
130
- # config
131
- self._config_file = None
132
- self._config = None
133
-
134
131
  # create logging handler to also log to stdout
135
132
  self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
136
133
  self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
137
134
  self._initialized = False
138
135
 
139
136
  self._resource_pool_info = {}
140
-
141
- @property
142
- def config(self) -> Config:
143
- assert self._config is not None
144
- return self._config
137
+ self._current_conn = None
138
+ self._current_session = None
145
139
 
146
140
  @property
147
141
  def db_url(self) -> str:
@@ -166,6 +160,33 @@ class Env:
166
160
  self.engine.dispose()
167
161
  self._create_engine(time_zone_name=tz_name)
168
162
 
163
+ @property
164
+ def conn(self) -> Optional[sql.Connection]:
165
+ assert self._current_conn is not None
166
+ return self._current_conn
167
+
168
+ @property
169
+ def session(self) -> Optional[sql.orm.Session]:
170
+ assert self._current_session is not None
171
+ return self._current_session
172
+
173
+ @contextmanager
174
+ def begin_xact(self) -> Iterator[sql.Connection]:
175
+ """Return a context manager that yields a connection to the database. Idempotent."""
176
+ if self._current_conn is None:
177
+ assert self._current_session is None
178
+ with self.engine.begin() as conn, sql.orm.Session(conn) as session:
179
+ self._current_conn = conn
180
+ self._current_session = session
181
+ try:
182
+ yield conn
183
+ finally:
184
+ self._current_session = None
185
+ self._current_conn = None
186
+ else:
187
+ assert self._current_session is not None
188
+ yield self._current_conn
189
+
169
190
  def configure_logging(
170
191
  self,
171
192
  *,
@@ -233,10 +254,7 @@ class Env:
233
254
  for module_name in path_parts[:max_idx]:
234
255
  if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
235
256
  return True
236
- if record.levelno >= self._default_log_level:
237
- return True
238
- else:
239
- return False
257
+ return record.levelno >= self._default_log_level
240
258
 
241
259
  @property
242
260
  def console_logger(self) -> ConsoleLogger:
@@ -248,28 +266,14 @@ class Env:
248
266
 
249
267
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
250
268
 
269
+ config = Config.get()
270
+
251
271
  self._initialized = True
252
- home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
253
- assert self._home is None or self._home == home
254
- self._home = home
255
- self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.toml')))
256
- self._media_dir = self._home / 'media'
257
- self._file_cache_dir = self._home / 'file_cache'
258
- self._dataset_cache_dir = self._home / 'dataset_cache'
259
- self._log_dir = self._home / 'logs'
260
- self._tmp_dir = self._home / 'tmp'
261
-
262
- if self._home.exists() and not self._home.is_dir():
263
- raise RuntimeError(f'{self._home} is not a directory')
264
-
265
- if not self._home.exists():
266
- # we don't have our logger set up yet, so print to stdout
267
- print(f'Creating a Pixeltable instance at: {self._home}')
268
- self._home.mkdir()
269
- # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
270
- # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
271
- # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
272
- reinit_db = True
272
+ self._media_dir = Config.get().home / 'media'
273
+ self._file_cache_dir = Config.get().home / 'file_cache'
274
+ self._dataset_cache_dir = Config.get().home / 'dataset_cache'
275
+ self._log_dir = Config.get().home / 'logs'
276
+ self._tmp_dir = Config.get().home / 'tmp'
273
277
 
274
278
  if not self._media_dir.exists():
275
279
  self._media_dir.mkdir()
@@ -282,26 +286,24 @@ class Env:
282
286
  if not self._tmp_dir.exists():
283
287
  self._tmp_dir.mkdir()
284
288
 
285
- # Read in the config
286
- self._config = Config.from_file(self._config_file)
287
- self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
289
+ self._file_cache_size_g = config.get_float_value('file_cache_size_g')
288
290
  if self._file_cache_size_g is None:
289
291
  raise excs.Error(
290
292
  'pixeltable/file_cache_size_g is missing from configuration\n'
291
- f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {self._config_file},\n'
293
+ f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
292
294
  'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
293
295
  )
294
- self._pxt_api_key = self._config.get_string_value('api_key')
296
+ self._pxt_api_key = config.get_string_value('api_key')
295
297
 
296
298
  # Disable spurious warnings
297
299
  warnings.simplefilter('ignore', category=TqdmWarning)
298
- if self._config.get_bool_value('hide_warnings'):
300
+ if config.get_bool_value('hide_warnings'):
299
301
  # Disable more warnings
300
302
  warnings.simplefilter('ignore', category=UserWarning)
301
303
  warnings.simplefilter('ignore', category=FutureWarning)
302
304
 
303
305
  # Set verbose level for user visible console messages
304
- verbosity = map_level(self._config.get_int_value('verbosity'))
306
+ verbosity = map_level(config.get_int_value('verbosity'))
305
307
  stdout_handler = ConsoleOutputHandler(stream=stdout)
306
308
  stdout_handler.setLevel(verbosity)
307
309
  stdout_handler.addFilter(ConsoleMessageFilter())
@@ -339,7 +341,7 @@ class Env:
339
341
  self.clear_tmp_dir()
340
342
 
341
343
  self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
342
- self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
344
+ self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
343
345
 
344
346
  # cleanup_mode=None will leave the postgres process running after Python exits
345
347
  # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -349,11 +351,11 @@ class Env:
349
351
  self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=cleanup_mode)
350
352
  self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
351
353
 
352
- tz_name = self.config.get_string_value('time_zone')
354
+ tz_name = config.get_string_value('time_zone')
353
355
  if tz_name is not None:
354
356
  # Validate tzname
355
357
  if not isinstance(tz_name, str):
356
- self._logger.error(f'Invalid time zone specified in configuration.')
358
+ self._logger.error('Invalid time zone specified in configuration.')
357
359
  else:
358
360
  try:
359
361
  _ = ZoneInfo(tz_name)
@@ -375,9 +377,9 @@ class Env:
375
377
  self._create_engine(time_zone_name=tz_name, echo=echo)
376
378
 
377
379
  if create_db:
378
- from pixeltable.metadata import schema
380
+ from pixeltable import metadata
379
381
 
380
- schema.base_metadata.create_all(self._sa_engine)
382
+ metadata.schema.base_metadata.create_all(self._sa_engine)
381
383
  metadata.create_system_info(self._sa_engine)
382
384
 
383
385
  self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')
@@ -460,6 +462,8 @@ class Env:
460
462
  engine.dispose()
461
463
 
462
464
  def _upgrade_metadata(self) -> None:
465
+ from pixeltable import metadata
466
+
463
467
  metadata.upgrade_md(self._sa_engine)
464
468
 
465
469
  @property
@@ -467,7 +471,7 @@ class Env:
467
471
  if self._pxt_api_key is None:
468
472
  raise excs.Error(
469
473
  'No API key is configured. Set the PIXELTABLE_API_KEY environment variable, or add an entry to '
470
- f'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
474
+ 'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
471
475
  )
472
476
  return self._pxt_api_key
473
477
 
@@ -486,14 +490,14 @@ class Env:
486
490
 
487
491
  init_kwargs: dict[str, str] = {}
488
492
  for param in cl.param_names:
489
- arg = self._config.get_string_value(param, section=name)
493
+ arg = Config.get().get_string_value(param, section=name)
490
494
  if arg is not None and len(arg) > 0:
491
495
  init_kwargs[param] = arg
492
496
  else:
493
497
  raise excs.Error(
494
498
  f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
495
- f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, or put `{param.lower()}` in '
496
- f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
499
+ f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
500
+ f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
497
501
  )
498
502
 
499
503
  cl.client_obj = cl.init_fn(**init_kwargs)
@@ -526,8 +530,6 @@ class Env:
526
530
  """Check for and start runtime services"""
527
531
  self._start_web_server()
528
532
  self.__register_packages()
529
- if self.is_installed_package('spacy'):
530
- self.__init_spacy()
531
533
 
532
534
  def __register_packages(self) -> None:
533
535
  """Declare optional packages that are utilized by some parts of the code."""
@@ -590,7 +592,8 @@ class Env:
590
592
  if not package_info.is_installed:
591
593
  # Still not found.
592
594
  raise excs.Error(
593
- f'This feature requires the `{package_name}` package. To install it, run: `pip install -U {package_info.library_name}`'
595
+ f'This feature requires the `{package_name}` package. To install it, run: '
596
+ f'`pip install -U {package_info.library_name}`'
594
597
  )
595
598
 
596
599
  if min_version is None:
@@ -603,41 +606,12 @@ class Env:
603
606
 
604
607
  if min_version > package_info.version:
605
608
  raise excs.Error(
606
- f'The installed version of package `{package_name}` is {".".join(str(v) for v in package_info.version)}, '
609
+ f'The installed version of package `{package_name}` is '
610
+ f'{".".join(str(v) for v in package_info.version)}, '
607
611
  f'but version >={".".join(str(v) for v in min_version)} is required. '
608
612
  f'To fix this, run: `pip install -U {package_info.library_name}`'
609
613
  )
610
614
 
611
- def __init_spacy(self) -> None:
612
- """
613
- spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
614
- dependency, we install it programmatically here. This should cause no problems, since the model packages
615
- have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
616
- """
617
- import spacy
618
- from spacy.cli.download import get_model_filename
619
-
620
- spacy_model = 'en_core_web_sm'
621
- spacy_model_version = '3.7.1'
622
- filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
623
- url = f'{spacy.about.__download_url__}/{filename}'
624
- # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
625
- # a problem, because the model have been installed on a previous attempt.
626
- self._logger.info(f'Ensuring spaCy model is installed: {filename}')
627
- ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
628
- if ret.returncode != 0:
629
- self._logger.warning(f'pip install failed for spaCy model: {filename}')
630
- try:
631
- self._logger.info(f'Loading spaCy model: {spacy_model}')
632
- self._spacy_nlp = spacy.load(spacy_model)
633
- except Exception as exc:
634
- self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
635
- warnings.warn(
636
- f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
637
- excs.PixeltableWarning,
638
- )
639
- self.__optional_packages['spacy'].is_installed = False
640
-
641
615
  def clear_tmp_dir(self) -> None:
642
616
  for path in glob.glob(f'{self._tmp_dir}/*'):
643
617
  if os.path.isdir(path):
@@ -660,11 +634,6 @@ class Env:
660
634
  self._resource_pool_info[pool_id] = info
661
635
  return info
662
636
 
663
- @property
664
- def home(self) -> Path:
665
- assert self._home is not None
666
- return self._home
667
-
668
637
  @property
669
638
  def media_dir(self) -> Path:
670
639
  assert self._media_dir is not None
@@ -693,9 +662,36 @@ class Env:
693
662
  @property
694
663
  def spacy_nlp(self) -> spacy.Language:
695
664
  Env.get().require_package('spacy')
665
+ if self._spacy_nlp is None:
666
+ self.__init_spacy()
696
667
  assert self._spacy_nlp is not None
697
668
  return self._spacy_nlp
698
669
 
670
+ def __init_spacy(self) -> None:
671
+ """
672
+ spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
673
+ dependency, we install it programmatically here. This should cause no problems, since the model packages
674
+ have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
675
+ """
676
+ import spacy
677
+ from spacy.cli.download import get_model_filename
678
+
679
+ spacy_model = 'en_core_web_sm'
680
+ spacy_model_version = '3.7.1'
681
+ filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
682
+ url = f'{spacy.about.__download_url__}/{filename}'
683
+ # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
684
+ # a problem, because the model might have been installed on a previous attempt.
685
+ self._logger.info(f'Ensuring spaCy model is installed: {filename}')
686
+ ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
687
+ if ret.returncode != 0:
688
+ self._logger.warning(f'pip install failed for spaCy model: {filename}')
689
+ self._logger.info(f'Loading spaCy model: {spacy_model}')
690
+ try:
691
+ self._spacy_nlp = spacy.load(spacy_model)
692
+ except Exception as exc:
693
+ raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
694
+
699
695
 
700
696
  def register_client(name: str) -> Callable:
701
697
  """Decorator that registers a third-party API client for use by Pixeltable.
@@ -723,7 +719,6 @@ def register_client(name: str) -> Callable:
723
719
  """
724
720
 
725
721
  def decorator(fn: Callable) -> None:
726
- global _registered_clients
727
722
  sig = inspect.signature(fn)
728
723
  param_names = list(sig.parameters.keys())
729
724
  _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
@@ -731,73 +726,6 @@ def register_client(name: str) -> Callable:
731
726
  return decorator
732
727
 
733
728
 
734
- class Config:
735
- """
736
- The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
737
- configuration values, which can be set in the config file or as environment variables.
738
- """
739
-
740
- __config: dict[str, Any]
741
-
742
- @classmethod
743
- def from_file(cls, path: Path) -> Config:
744
- """
745
- Loads configuration from the specified TOML file. If the file does not exist, it will be
746
- created and populated with the default configuration.
747
- """
748
- if os.path.isfile(path):
749
- with open(path, 'r') as stream:
750
- try:
751
- config_dict = toml.load(stream)
752
- except Exception as exc:
753
- raise excs.Error(f'Could not read config file: {str(path)}') from exc
754
- else:
755
- config_dict = cls.__create_default_config(path)
756
- with open(path, 'w') as stream:
757
- try:
758
- toml.dump(config_dict, stream)
759
- except Exception as exc:
760
- raise excs.Error(f'Could not write config file: {str(path)}') from exc
761
- logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
762
- return cls(config_dict)
763
-
764
- @classmethod
765
- def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
766
- free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
767
- # Default cache size is 1/5 of free disk space
768
- file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
769
- return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
770
-
771
- def __init__(self, config: dict[str, Any]) -> None:
772
- self.__config = config
773
-
774
- def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
775
- env_var = f'{section.upper()}_{key.upper()}'
776
- if env_var in os.environ:
777
- value = os.environ[env_var]
778
- elif section in self.__config and key in self.__config[section]:
779
- value = self.__config[section][key]
780
- else:
781
- return None
782
-
783
- try:
784
- return expected_type(value) # type: ignore[call-arg]
785
- except ValueError:
786
- raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
787
-
788
- def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
789
- return self.get_value(key, str, section)
790
-
791
- def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
792
- return self.get_value(key, int, section)
793
-
794
- def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
795
- return self.get_value(key, float, section)
796
-
797
- def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
798
- return self.get_value(key, bool, section)
799
-
800
-
801
729
  _registered_clients: dict[str, ApiClient] = {}
802
730
 
803
731
 
@@ -852,7 +780,8 @@ class RateLimitsInfo:
852
780
  # TODO: remove
853
781
  for info in self.resource_limits.values():
854
782
  _logger.debug(
855
- f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
783
+ f'Init {info.resource} rate limit: rem={info.remaining} '
784
+ f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
856
785
  )
857
786
  else:
858
787
  for k, v in kwargs.items():
@@ -886,5 +815,6 @@ class RateLimitInfo:
886
815
  self.reset_at = reset_at
887
816
  # TODO: remove
888
817
  _logger.debug(
889
- f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
818
+ f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
819
+ f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
890
820
  )
pixeltable/exec/aggregation_node.py CHANGED
@@ -29,7 +29,7 @@ class AggregationNode(ExecNode):
29
29
 
30
30
  def __init__(
31
31
  self,
32
- tbl: catalog.TableVersion,
32
+ tbl: catalog.TableVersionHandle,
33
33
  row_builder: exprs.RowBuilder,
34
34
  group_by: Optional[list[exprs.Expr]],
35
35
  agg_fn_calls: list[exprs.FunctionCall],
@@ -86,9 +86,10 @@ class AggregationNode(ExecNode):
86
86
  self._reset_agg_state(0)
87
87
  self._update_agg_state(row, 0)
88
88
  prev_row = row
89
- # emit the last group
90
- self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
91
- self.output_batch.add_row(prev_row)
89
+ if prev_row is not None:
90
+ # emit the last group
91
+ self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
92
+ self.output_batch.add_row(prev_row)
92
93
 
93
94
  self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
94
95
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -40,7 +40,7 @@ class CachePrefetchNode(ExecNode):
40
40
  boto_client_lock: threading.Lock
41
41
 
42
42
  # execution state
43
- batch_tbl_version: Optional[catalog.TableVersion] # needed to construct output batches
43
+ batch_tbl_version: Optional[catalog.TableVersionHandle] # needed to construct output batches
44
44
  num_returned_rows: int
45
45
 
46
46
  # ready_rows: rows that are ready to be returned, ordered by row idx;
pixeltable/exec/component_iteration_node.py CHANGED
@@ -14,23 +14,25 @@ class ComponentIterationNode(ExecNode):
14
14
  Returns row batches of OUTPUT_BATCH_SIZE size.
15
15
  """
16
16
 
17
+ view: catalog.TableVersionHandle
18
+
17
19
  __OUTPUT_BATCH_SIZE = 1024
18
20
 
19
- def __init__(self, view: catalog.TableVersion, input: ExecNode):
20
- assert view.is_component_view()
21
+ def __init__(self, view: catalog.TableVersionHandle, input: ExecNode):
22
+ assert view.get().is_component_view
21
23
  super().__init__(input.row_builder, [], [], input)
22
24
  self.view = view
23
- iterator_args = [view.iterator_args.copy()]
25
+ iterator_args = [view.get().iterator_args.copy()]
24
26
  self.row_builder.set_slot_idxs(iterator_args)
25
27
  self.iterator_args = iterator_args[0]
26
28
  assert isinstance(self.iterator_args, exprs.InlineDict)
27
29
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
28
- self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
30
+ self.iterator_output_schema, self.unstored_column_names = self.view.get().iterator_cls.output_schema(
29
31
  **self.iterator_args.to_kwargs()
30
32
  )
31
33
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
32
34
  self.iterator_output_cols = {
33
- field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
35
+ field_name: self.view.get().cols_by_name[field_name] for field_name in self.iterator_output_fields
34
36
  }
35
37
  # referenced iterator output fields
36
38
  self.refd_output_slot_idxs = {
@@ -50,7 +52,7 @@ class ComponentIterationNode(ExecNode):
50
52
  # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
51
53
  # output rows for this input row).
52
54
  if self.__non_nullable_args_specified(iterator_args):
53
- iterator = self.view.iterator_cls(**iterator_args)
55
+ iterator = self.view.get().iterator_cls(**iterator_args)
54
56
  for pos, component_dict in enumerate(iterator):
55
57
  output_row = output_batch.add_row()
56
58
  input_row.copy(output_row)
@@ -67,7 +69,7 @@ class ComponentIterationNode(ExecNode):
67
69
  """
68
70
  Returns true if all non-nullable iterator arguments are not `None`.
69
71
  """
70
- input_schema = self.view.iterator_cls.input_schema()
72
+ input_schema = self.view.get().iterator_cls.input_schema()
71
73
  for arg_name, arg_value in iterator_args.items():
72
74
  col_type = input_schema[arg_name]
73
75
  if arg_value is None and not col_type.nullable:
@@ -80,7 +82,9 @@ class ComponentIterationNode(ExecNode):
80
82
  # verify and copy component_dict fields to their respective slots in output_row
81
83
  for field_name, field_val in component_dict.items():
82
84
  if field_name not in self.iterator_output_fields:
83
- raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
85
+ raise excs.Error(
86
+ f'Invalid field name {field_name} in output of {self.view.get().iterator_cls.__name__}'
87
+ )
84
88
  if field_name not in self.refd_output_slot_idxs:
85
89
  # we can ignore this
86
90
  continue
@@ -90,5 +94,5 @@ class ComponentIterationNode(ExecNode):
90
94
  if len(component_dict) != len(self.iterator_output_fields):
91
95
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
92
96
  raise excs.Error(
93
- f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
97
+ f'Invalid output of {self.view.get().iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
94
98
  )
@@ -16,7 +16,7 @@ class DataRowBatch:
16
16
  Contains the metadata needed to initialize DataRows.
17
17
  """
18
18
 
19
- tbl: Optional[catalog.TableVersion]
19
+ tbl: Optional[catalog.TableVersionHandle]
20
20
  row_builder: exprs.RowBuilder
21
21
  img_slot_idxs: list[int]
22
22
  media_slot_idxs: list[int] # non-image media slots
@@ -25,7 +25,7 @@ class DataRowBatch:
25
25
 
26
26
  def __init__(
27
27
  self,
28
- tbl: Optional[catalog.TableVersion],
28
+ tbl: Optional[catalog.TableVersionHandle],
29
29
  row_builder: exprs.RowBuilder,
30
30
  num_rows: Optional[int] = None,
31
31
  rows: Optional[list[exprs.DataRow]] = None,
@@ -91,7 +91,7 @@ class DataRowBatch:
91
91
  idx_range = slice(0, len(self.rows))
92
92
  for row in self.rows[idx_range]:
93
93
  for info in stored_img_info:
94
- filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
94
+ filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
95
95
  row.flush_img(info.slot_idx, filepath)
96
96
  for slot_idx in flushed_slot_idxs:
97
97
  row.flush_img(slot_idx)
@@ -28,7 +28,3 @@ class ExecContext:
28
28
  self.pk_clause = pk_clause
29
29
  self.num_computed_exprs = num_computed_exprs
30
30
  self.ignore_errors = ignore_errors
31
-
32
- def set_conn(self, conn: sql.engine.Connection) -> None:
33
- self.conn = conn
34
- self.row_builder.set_conn(conn)
@@ -6,7 +6,7 @@ import logging
6
6
  import sys
7
7
  from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
8
8
 
9
- import pixeltable.exprs as exprs
9
+ from pixeltable import exprs
10
10
 
11
11
  from .data_row_batch import DataRowBatch
12
12
  from .exec_context import ExecContext
@@ -31,6 +31,7 @@ class ExecNode(abc.ABC):
31
31
  input_exprs: Iterable[exprs.Expr],
32
32
  input: Optional[ExecNode] = None,
33
33
  ):
34
+ assert all(expr.is_valid for expr in output_exprs)
34
35
  self.output_exprs = output_exprs
35
36
  self.row_builder = row_builder
36
37
  self.input = input
@@ -65,7 +66,7 @@ class ExecNode(abc.ABC):
65
66
  # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
66
67
  # multiple run_until_complete()
67
68
  running_loop = asyncio.get_running_loop()
68
- import nest_asyncio # type: ignore
69
+ import nest_asyncio # type: ignore[import-untyped]
69
70
 
70
71
  nest_asyncio.apply()
71
72
  loop = running_loop
@@ -9,6 +9,7 @@ import time
9
9
  from typing import Awaitable, Collection, Optional
10
10
 
11
11
  from pixeltable import env, func
12
+ from pixeltable.config import Config
12
13
 
13
14
  from .globals import Dispatcher, FnCallArgs, Scheduler
14
15
 
@@ -276,7 +277,7 @@ class RequestRateScheduler(Scheduler):
276
277
  _, endpoint, model = elems
277
278
  section = f'{endpoint}.rate_limits'
278
279
  key = model
279
- requests_per_min = env.Env.get().config.get_int_value(key, section=section)
280
+ requests_per_min = Config.get().get_int_value(key, section=section)
280
281
  requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
281
282
  self.secs_per_request = 1 / (requests_per_min / 60)
282
283