pixeltable 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +5 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -0
- pixeltable/catalog/catalog.py +335 -128
- pixeltable/catalog/column.py +21 -5
- pixeltable/catalog/dir.py +19 -6
- pixeltable/catalog/insertable_table.py +34 -37
- pixeltable/catalog/named_function.py +0 -4
- pixeltable/catalog/schema_object.py +28 -42
- pixeltable/catalog/table.py +195 -158
- pixeltable/catalog/table_version.py +187 -232
- pixeltable/catalog/table_version_handle.py +50 -0
- pixeltable/catalog/table_version_path.py +49 -33
- pixeltable/catalog/view.py +56 -96
- pixeltable/config.py +103 -0
- pixeltable/dataframe.py +90 -90
- pixeltable/env.py +98 -168
- pixeltable/exec/aggregation_node.py +5 -4
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/component_iteration_node.py +13 -9
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +0 -4
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval/schedulers.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -4
- pixeltable/exec/row_update_node.py +1 -2
- pixeltable/exec/sql_node.py +20 -16
- pixeltable/exprs/column_ref.py +9 -9
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +4 -4
- pixeltable/exprs/expr.py +20 -5
- pixeltable/exprs/function_call.py +98 -58
- pixeltable/exprs/json_mapper.py +25 -8
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/object_ref.py +16 -5
- pixeltable/exprs/row_builder.py +15 -15
- pixeltable/exprs/rowid_ref.py +21 -7
- pixeltable/func/__init__.py +1 -1
- pixeltable/func/function.py +38 -6
- pixeltable/func/query_template_function.py +3 -6
- pixeltable/func/tools.py +26 -26
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +9 -3
- pixeltable/functions/fireworks.py +7 -4
- pixeltable/functions/globals.py +4 -5
- pixeltable/functions/huggingface.py +1 -5
- pixeltable/functions/image.py +17 -7
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +4 -4
- pixeltable/functions/openai.py +26 -23
- pixeltable/functions/string.py +23 -30
- pixeltable/functions/timestamp.py +11 -6
- pixeltable/functions/together.py +14 -12
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +5 -4
- pixeltable/functions/vision.py +6 -9
- pixeltable/functions/whisper.py +3 -3
- pixeltable/globals.py +246 -260
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +1 -1
- pixeltable/index/btree.py +3 -1
- pixeltable/index/embedding_index.py +11 -5
- pixeltable/io/external_store.py +11 -12
- pixeltable/io/label_studio.py +4 -3
- pixeltable/io/parquet.py +57 -56
- pixeltable/iterators/__init__.py +4 -2
- pixeltable/iterators/audio.py +11 -11
- pixeltable/iterators/document.py +10 -10
- pixeltable/iterators/string.py +1 -2
- pixeltable/iterators/video.py +14 -15
- pixeltable/metadata/__init__.py +9 -5
- pixeltable/metadata/converters/convert_10.py +0 -1
- pixeltable/metadata/converters/convert_15.py +0 -2
- pixeltable/metadata/converters/convert_23.py +0 -2
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_27.py +0 -2
- pixeltable/metadata/converters/convert_28.py +0 -2
- pixeltable/metadata/converters/convert_29.py +7 -8
- pixeltable/metadata/converters/util.py +7 -7
- pixeltable/metadata/schema.py +27 -19
- pixeltable/plan.py +68 -40
- pixeltable/share/packager.py +12 -9
- pixeltable/store.py +37 -38
- pixeltable/type_system.py +41 -28
- pixeltable/utils/filecache.py +2 -1
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/METADATA +1 -1
- pixeltable-0.3.7.dist-info/RECORD +174 -0
- pixeltable-0.3.5.dist-info/RECORD +0 -172
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.5.dist-info → pixeltable-0.3.7.dist-info}/entry_points.txt +0 -0
pixeltable/env.py
CHANGED
@@ -16,19 +16,19 @@ import threading
 import uuid
 import warnings
 from abc import abstractmethod
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from sys import stdout
-from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

 import pixeltable_pgserver
 import sqlalchemy as sql
-import toml
 from tqdm import TqdmWarning

-import
-from pixeltable import
+from pixeltable import exceptions as excs
+from pixeltable.config import Config
 from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
 from pixeltable.utils.http_server import make_server

@@ -47,9 +47,9 @@ class Env:
     """

     _instance: Optional[Env] = None
+    __initializing: bool = False
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

-    _home: Optional[Path]
     _media_dir: Optional[Path]
     _file_cache_dir: Optional[Path] # cached media files with external URL
     _dataset_cache_dir: Optional[Path] # cached datasets (eg, pytorch or COCO)
@@ -69,19 +69,18 @@ class Env:
     _httpd: Optional[http.server.HTTPServer]
     _http_address: Optional[str]
     _logger: logging.Logger
-    _console_logger: ConsoleLogger
     _default_log_level: int
     _logfilename: Optional[str]
     _log_to_stdout: bool
     _module_log_level: dict[str, int] # module name -> log level
-    _config_file: Optional[Path]
-    _config: Optional[Config]
     _file_cache_size_g: float
     _pxt_api_key: Optional[str]
     _stdout_handler: logging.StreamHandler
     _initialized: bool

     _resource_pool_info: dict[str, Any]
+    _current_conn: Optional[sql.Connection]
+    _current_session: Optional[sql.orm.Session]

     @classmethod
     def get(cls) -> Env:
@@ -91,15 +90,17 @@ class Env:

     @classmethod
     def _init_env(cls, reinit_db: bool = False) -> None:
+        assert not cls.__initializing, 'Circular env initialization detected.'
+        cls.__initializing = True
         env = Env()
         env._set_up(reinit_db=reinit_db)
         env._upgrade_metadata()
         cls._instance = env
+        cls.__initializing = False

     def __init__(self):
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'

-        self._home = None
         self._media_dir = None # computed media files
         self._file_cache_dir = None # cached media files with external URL
         self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
@@ -127,21 +128,14 @@ class Env:
         self._log_to_stdout = False
         self._module_log_level = {} # module name -> log level

-        # config
-        self._config_file = None
-        self._config = None
-
         # create logging handler to also log to stdout
         self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
         self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
         self._initialized = False

         self._resource_pool_info = {}
-
-
-    def config(self) -> Config:
-        assert self._config is not None
-        return self._config
+        self._current_conn = None
+        self._current_session = None

     @property
     def db_url(self) -> str:
@@ -166,6 +160,33 @@ class Env:
         self.engine.dispose()
         self._create_engine(time_zone_name=tz_name)

+    @property
+    def conn(self) -> Optional[sql.Connection]:
+        assert self._current_conn is not None
+        return self._current_conn
+
+    @property
+    def session(self) -> Optional[sql.orm.Session]:
+        assert self._current_session is not None
+        return self._current_session
+
+    @contextmanager
+    def begin_xact(self) -> Iterator[sql.Connection]:
+        """Return a context manager that yields a connection to the database. Idempotent."""
+        if self._current_conn is None:
+            assert self._current_session is None
+            with self.engine.begin() as conn, sql.orm.Session(conn) as session:
+                self._current_conn = conn
+                self._current_session = session
+                try:
+                    yield conn
+                finally:
+                    self._current_session = None
+                    self._current_conn = None
+        else:
+            assert self._current_session is not None
+            yield self._current_conn
+
     def configure_logging(
         self,
         *,
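Note: the Env.begin_xact() context manager added above is idempotent: the outermost call opens the transaction and ORM session, and nested calls reuse the open connection instead of starting a new one. A minimal usage sketch (hypothetical caller code, not part of this diff):

    import sqlalchemy as sql
    from pixeltable.env import Env

    env = Env.get()
    with env.begin_xact() as conn:           # outermost call begins the transaction
        with env.begin_xact() as same_conn:  # nested call reuses the open connection
            assert conn is same_conn
            conn.execute(sql.text('SELECT 1'))
    # on exit, _current_conn/_current_session are reset to None and engine.begin() commits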
@@ -233,10 +254,7 @@ class Env:
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
                 return True
-
-            return True
-        else:
-            return False
+        return record.levelno >= self._default_log_level

     @property
     def console_logger(self) -> ConsoleLogger:
@@ -248,28 +266,14 @@ class Env:

         os.environ['TOKENIZERS_PARALLELISM'] = 'false'

+        config = Config.get()
+
         self._initialized = True
-
-
-        self.
-        self.
-        self.
-        self._file_cache_dir = self._home / 'file_cache'
-        self._dataset_cache_dir = self._home / 'dataset_cache'
-        self._log_dir = self._home / 'logs'
-        self._tmp_dir = self._home / 'tmp'
-
-        if self._home.exists() and not self._home.is_dir():
-            raise RuntimeError(f'{self._home} is not a directory')
-
-        if not self._home.exists():
-            # we don't have our logger set up yet, so print to stdout
-            print(f'Creating a Pixeltable instance at: {self._home}')
-            self._home.mkdir()
-            # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
-            # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
-            # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
-            reinit_db = True
+        self._media_dir = Config.get().home / 'media'
+        self._file_cache_dir = Config.get().home / 'file_cache'
+        self._dataset_cache_dir = Config.get().home / 'dataset_cache'
+        self._log_dir = Config.get().home / 'logs'
+        self._tmp_dir = Config.get().home / 'tmp'

         if not self._media_dir.exists():
             self._media_dir.mkdir()
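Note: the Config class that previously lived at the bottom of env.py (its removal appears further down in this diff) now lives in the new pixeltable/config.py (+103 lines in the file list) and is consumed through a Config.get() singleton. A hedged sketch of reading settings through it, using only the accessor names and keys that appear in this diff:

    from pixeltable.config import Config

    config = Config.get()
    # values may come from config.toml or from environment variables such as
    # PIXELTABLE_FILE_CACHE_SIZE_G (per the error message in the surrounding hunks)
    cache_size_g = config.get_float_value('file_cache_size_g')
    api_key = config.get_string_value('api_key')
    hide_warnings = config.get_bool_value('hide_warnings')
    verbosity = config.get_int_value('verbosity')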
@@ -282,26 +286,24 @@ class Env:
         if not self._tmp_dir.exists():
             self._tmp_dir.mkdir()

-
-        self._config = Config.from_file(self._config_file)
-        self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
+        self._file_cache_size_g = config.get_float_value('file_cache_size_g')
         if self._file_cache_size_g is None:
             raise excs.Error(
                 'pixeltable/file_cache_size_g is missing from configuration\n'
-                f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {
+                f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
                 'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
             )
-        self._pxt_api_key =
+        self._pxt_api_key = config.get_string_value('api_key')

         # Disable spurious warnings
         warnings.simplefilter('ignore', category=TqdmWarning)
-        if
+        if config.get_bool_value('hide_warnings'):
             # Disable more warnings
             warnings.simplefilter('ignore', category=UserWarning)
             warnings.simplefilter('ignore', category=FutureWarning)

         # Set verbose level for user visible console messages
-        verbosity = map_level(
+        verbosity = map_level(config.get_int_value('verbosity'))
         stdout_handler = ConsoleOutputHandler(stream=stdout)
         stdout_handler.setLevel(verbosity)
         stdout_handler.addFilter(ConsoleMessageFilter())
@@ -339,7 +341,7 @@
         self.clear_tmp_dir()

         self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
-        self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(
+        self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))

         # cleanup_mode=None will leave the postgres process running after Python exits
         # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -349,11 +351,11 @@
         self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=cleanup_mode)
         self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')

-        tz_name =
+        tz_name = config.get_string_value('time_zone')
         if tz_name is not None:
             # Validate tzname
             if not isinstance(tz_name, str):
-                self._logger.error(
+                self._logger.error('Invalid time zone specified in configuration.')
             else:
                 try:
                     _ = ZoneInfo(tz_name)
@@ -375,9 +377,9 @@
         self._create_engine(time_zone_name=tz_name, echo=echo)

         if create_db:
-            from pixeltable
+            from pixeltable import metadata

-            schema.base_metadata.create_all(self._sa_engine)
+            metadata.schema.base_metadata.create_all(self._sa_engine)
             metadata.create_system_info(self._sa_engine)

         self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')
@@ -460,6 +462,8 @@
         engine.dispose()

     def _upgrade_metadata(self) -> None:
+        from pixeltable import metadata
+
         metadata.upgrade_md(self._sa_engine)

     @property
@@ -467,7 +471,7 @@
         if self._pxt_api_key is None:
             raise excs.Error(
                 'No API key is configured. Set the PIXELTABLE_API_KEY environment variable, or add an entry to '
-
+                'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
             )
         return self._pxt_api_key

@@ -486,14 +490,14 @@

         init_kwargs: dict[str, str] = {}
         for param in cl.param_names:
-            arg =
+            arg = Config.get().get_string_value(param, section=name)
             if arg is not None and len(arg) > 0:
                 init_kwargs[param] = arg
             else:
                 raise excs.Error(
                     f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
-                    f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable,
-                    f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
+                    f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
+                    f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                 )

         cl.client_obj = cl.init_fn(**init_kwargs)
@@ -526,8 +530,6 @@
         """Check for and start runtime services"""
         self._start_web_server()
         self.__register_packages()
-        if self.is_installed_package('spacy'):
-            self.__init_spacy()

     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
@@ -590,7 +592,8 @@
         if not package_info.is_installed:
             # Still not found.
             raise excs.Error(
-                f'This feature requires the `{package_name}` package. To install it, run:
+                f'This feature requires the `{package_name}` package. To install it, run: '
+                f'`pip install -U {package_info.library_name}`'
             )

         if min_version is None:
@@ -603,41 +606,12 @@

         if min_version > package_info.version:
             raise excs.Error(
-                f'The installed version of package `{package_name}` is
+                f'The installed version of package `{package_name}` is '
+                f'{".".join(str(v) for v in package_info.version)}, '
                 f'but version >={".".join(str(v) for v in min_version)} is required. '
                 f'To fix this, run: `pip install -U {package_info.library_name}`'
             )

-    def __init_spacy(self) -> None:
-        """
-        spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
-        dependency, we install it programmatically here. This should cause no problems, since the model packages
-        have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
-        """
-        import spacy
-        from spacy.cli.download import get_model_filename
-
-        spacy_model = 'en_core_web_sm'
-        spacy_model_version = '3.7.1'
-        filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
-        url = f'{spacy.about.__download_url__}/{filename}'
-        # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
-        # a problem, because the model have been installed on a previous attempt.
-        self._logger.info(f'Ensuring spaCy model is installed: {filename}')
-        ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
-        if ret.returncode != 0:
-            self._logger.warning(f'pip install failed for spaCy model: {filename}')
-        try:
-            self._logger.info(f'Loading spaCy model: {spacy_model}')
-            self._spacy_nlp = spacy.load(spacy_model)
-        except Exception as exc:
-            self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
-            warnings.warn(
-                f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
-                excs.PixeltableWarning,
-            )
-            self.__optional_packages['spacy'].is_installed = False
-
     def clear_tmp_dir(self) -> None:
         for path in glob.glob(f'{self._tmp_dir}/*'):
             if os.path.isdir(path):
@@ -660,11 +634,6 @@
         self._resource_pool_info[pool_id] = info
         return info

-    @property
-    def home(self) -> Path:
-        assert self._home is not None
-        return self._home
-
     @property
     def media_dir(self) -> Path:
         assert self._media_dir is not None
@@ -693,9 +662,36 @@
     @property
     def spacy_nlp(self) -> spacy.Language:
         Env.get().require_package('spacy')
+        if self._spacy_nlp is None:
+            self.__init_spacy()
         assert self._spacy_nlp is not None
         return self._spacy_nlp

+    def __init_spacy(self) -> None:
+        """
+        spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
+        dependency, we install it programmatically here. This should cause no problems, since the model packages
+        have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
+        """
+        import spacy
+        from spacy.cli.download import get_model_filename
+
+        spacy_model = 'en_core_web_sm'
+        spacy_model_version = '3.7.1'
+        filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
+        url = f'{spacy.about.__download_url__}/{filename}'
+        # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
+        # a problem, because the model might have been installed on a previous attempt.
+        self._logger.info(f'Ensuring spaCy model is installed: {filename}')
+        ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
+        if ret.returncode != 0:
+            self._logger.warning(f'pip install failed for spaCy model: {filename}')
+        self._logger.info(f'Loading spaCy model: {spacy_model}')
+        try:
+            self._spacy_nlp = spacy.load(spacy_model)
+        except Exception as exc:
+            raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
+

 def register_client(name: str) -> Callable:
     """Decorator that registers a third-party API client for use by Pixeltable.
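Note: spaCy model setup is now lazy. The old code ran __init_spacy() eagerly during _set_up() whenever the package was installed (see the removal in the @@ -526,8 hunk above), whereas the new code defers it to the first access of Env.spacy_nlp and raises excs.Error if the model cannot be loaded. A hypothetical caller, to show when the download/load cost is paid:

    from pixeltable.env import Env

    nlp = Env.get().spacy_nlp  # first access triggers __init_spacy(): installs and loads en_core_web_sm
    doc = nlp('Pixeltable stores multimodal data.')
    print([token.text for token in doc])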
@@ -723,7 +719,6 @@ def register_client(name: str) -> Callable:
     """

     def decorator(fn: Callable) -> None:
-        global _registered_clients
         sig = inspect.signature(fn)
         param_names = list(sig.parameters.keys())
         _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
@@ -731,73 +726,6 @@ def register_client(name: str) -> Callable:
     return decorator


-class Config:
-    """
-    The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
-    configuration values, which can be set in the config file or as environment variables.
-    """
-
-    __config: dict[str, Any]
-
-    @classmethod
-    def from_file(cls, path: Path) -> Config:
-        """
-        Loads configuration from the specified TOML file. If the file does not exist, it will be
-        created and populated with the default configuration.
-        """
-        if os.path.isfile(path):
-            with open(path, 'r') as stream:
-                try:
-                    config_dict = toml.load(stream)
-                except Exception as exc:
-                    raise excs.Error(f'Could not read config file: {str(path)}') from exc
-        else:
-            config_dict = cls.__create_default_config(path)
-            with open(path, 'w') as stream:
-                try:
-                    toml.dump(config_dict, stream)
-                except Exception as exc:
-                    raise excs.Error(f'Could not write config file: {str(path)}') from exc
-            logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
-        return cls(config_dict)
-
-    @classmethod
-    def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
-        free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
-        # Default cache size is 1/5 of free disk space
-        file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
-        return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
-
-    def __init__(self, config: dict[str, Any]) -> None:
-        self.__config = config
-
-    def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
-        env_var = f'{section.upper()}_{key.upper()}'
-        if env_var in os.environ:
-            value = os.environ[env_var]
-        elif section in self.__config and key in self.__config[section]:
-            value = self.__config[section][key]
-        else:
-            return None
-
-        try:
-            return expected_type(value) # type: ignore[call-arg]
-        except ValueError:
-            raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
-
-    def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
-        return self.get_value(key, str, section)
-
-    def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
-        return self.get_value(key, int, section)
-
-    def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
-        return self.get_value(key, float, section)
-
-    def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
-        return self.get_value(key, bool, section)
-
-
 _registered_clients: dict[str, ApiClient] = {}


@@ -852,7 +780,8 @@ class RateLimitsInfo:
             # TODO: remove
             for info in self.resource_limits.values():
                 _logger.debug(
-                    f'Init {info.resource} rate limit: rem={info.remaining}
+                    f'Init {info.resource} rate limit: rem={info.remaining} '
+                    f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
                 )
         else:
             for k, v in kwargs.items():
@@ -886,5 +815,6 @@ class RateLimitInfo:
         self.reset_at = reset_at
         # TODO: remove
         _logger.debug(
-            f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)}
+            f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
+            f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
         )
pixeltable/exec/aggregation_node.py
CHANGED
@@ -29,7 +29,7 @@ class AggregationNode(ExecNode):

     def __init__(
         self,
-        tbl: catalog.
+        tbl: catalog.TableVersionHandle,
         row_builder: exprs.RowBuilder,
         group_by: Optional[list[exprs.Expr]],
         agg_fn_calls: list[exprs.FunctionCall],
@@ -86,9 +86,10 @@ class AggregationNode(ExecNode):
                 self._reset_agg_state(0)
             self._update_agg_state(row, 0)
             prev_row = row
-
-
-
+        if prev_row is not None:
+            # emit the last group
+            self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
+            self.output_batch.add_row(prev_row)

         self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
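Note: the AggregationNode change ensures the final group is emitted once the input is exhausted (the content of the three removed lines was lost in this rendering). A minimal, self-contained sketch of the pattern with hypothetical names, independent of Pixeltable's row and batch classes:

    def aggregate(rows, key_fn, agg_fn):
        # group-by aggregation over key-sorted input
        out = []
        current_key, acc = None, []
        for row in rows:
            k = key_fn(row)
            if acc and k != current_key:
                out.append((current_key, agg_fn(acc)))  # emit the completed group
                acc = []
            current_key = k
            acc.append(row)
        if acc:
            # emit the last group -- without this step the final group is silently dropped
            out.append((current_key, agg_fn(acc)))
        return out

    print(aggregate([1, 1, 2, 2, 2, 3], key_fn=lambda x: x, agg_fn=len))
    # [(1, 2), (2, 3), (3, 1)]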
pixeltable/exec/cache_prefetch_node.py
CHANGED
@@ -40,7 +40,7 @@ class CachePrefetchNode(ExecNode):
     boto_client_lock: threading.Lock

     # execution state
-    batch_tbl_version: Optional[catalog.
+    batch_tbl_version: Optional[catalog.TableVersionHandle] # needed to construct output batches
     num_returned_rows: int

     # ready_rows: rows that are ready to be returned, ordered by row idx;
pixeltable/exec/component_iteration_node.py
CHANGED
@@ -14,23 +14,25 @@ class ComponentIterationNode(ExecNode):
     Returns row batches of OUTPUT_BATCH_SIZE size.
     """

+    view: catalog.TableVersionHandle
+
     __OUTPUT_BATCH_SIZE = 1024

-    def __init__(self, view: catalog.
-        assert view.
+    def __init__(self, view: catalog.TableVersionHandle, input: ExecNode):
+        assert view.get().is_component_view
         super().__init__(input.row_builder, [], [], input)
         self.view = view
-        iterator_args = [view.iterator_args.copy()]
+        iterator_args = [view.get().iterator_args.copy()]
         self.row_builder.set_slot_idxs(iterator_args)
         self.iterator_args = iterator_args[0]
         assert isinstance(self.iterator_args, exprs.InlineDict)
         self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
-        self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
+        self.iterator_output_schema, self.unstored_column_names = self.view.get().iterator_cls.output_schema(
             **self.iterator_args.to_kwargs()
         )
         self.iterator_output_fields = list(self.iterator_output_schema.keys())
         self.iterator_output_cols = {
-            field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
+            field_name: self.view.get().cols_by_name[field_name] for field_name in self.iterator_output_fields
         }
         # referenced iterator output fields
         self.refd_output_slot_idxs = {
@@ -50,7 +52,7 @@ class ComponentIterationNode(ExecNode):
             # specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
            # output rows for this input row).
             if self.__non_nullable_args_specified(iterator_args):
-                iterator = self.view.iterator_cls(**iterator_args)
+                iterator = self.view.get().iterator_cls(**iterator_args)
                 for pos, component_dict in enumerate(iterator):
                     output_row = output_batch.add_row()
                     input_row.copy(output_row)
@@ -67,7 +69,7 @@ class ComponentIterationNode(ExecNode):
         """
         Returns true if all non-nullable iterator arguments are not `None`.
         """
-        input_schema = self.view.iterator_cls.input_schema()
+        input_schema = self.view.get().iterator_cls.input_schema()
         for arg_name, arg_value in iterator_args.items():
             col_type = input_schema[arg_name]
             if arg_value is None and not col_type.nullable:
@@ -80,7 +82,9 @@ class ComponentIterationNode(ExecNode):
         # verify and copy component_dict fields to their respective slots in output_row
         for field_name, field_val in component_dict.items():
             if field_name not in self.iterator_output_fields:
-                raise excs.Error(
+                raise excs.Error(
+                    f'Invalid field name {field_name} in output of {self.view.get().iterator_cls.__name__}'
+                )
             if field_name not in self.refd_output_slot_idxs:
                 # we can ignore this
                 continue
@@ -90,5 +94,5 @@ class ComponentIterationNode(ExecNode):
         if len(component_dict) != len(self.iterator_output_fields):
             missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
             raise excs.Error(
-                f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
+                f'Invalid output of {self.view.get().iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
             )
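Note: a pattern that recurs throughout these exec-node hunks (and presumably relates to the new pixeltable/catalog/table_version_handle.py, +50 lines in the file list): nodes now hold a catalog.TableVersionHandle rather than a table-version object directly, and dereference it at the point of use (view.get().iterator_cls, tbl.get().version, and so on). A rough, illustrative sketch of handle-style indirection; the constructor and resolution mechanism below are assumptions, not Pixeltable's actual implementation:

    from dataclasses import dataclass
    from typing import Callable

    @dataclass
    class TableVersion:  # stand-in for the real catalog object
        version: int

    class TableVersionHandle:  # illustrative only
        def __init__(self, resolve: Callable[[], TableVersion]) -> None:
            self._resolve = resolve

        def get(self) -> TableVersion:
            # resolved at the point of use, so callers always see the current state
            return self._resolve()

    state = {'tv': TableVersion(version=1)}
    handle = TableVersionHandle(lambda: state['tv'])
    print(handle.get().version)  # 1
    state['tv'] = TableVersion(version=2)
    print(handle.get().version)  # 2: the handle tracks the latest version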
pixeltable/exec/data_row_batch.py
CHANGED
@@ -16,7 +16,7 @@ class DataRowBatch:
     Contains the metadata needed to initialize DataRows.
     """

-    tbl: Optional[catalog.
+    tbl: Optional[catalog.TableVersionHandle]
     row_builder: exprs.RowBuilder
     img_slot_idxs: list[int]
     media_slot_idxs: list[int] # non-image media slots
@@ -25,7 +25,7 @@ class DataRowBatch:

     def __init__(
         self,
-        tbl: Optional[catalog.
+        tbl: Optional[catalog.TableVersionHandle],
         row_builder: exprs.RowBuilder,
         num_rows: Optional[int] = None,
         rows: Optional[list[exprs.DataRow]] = None,
@@ -91,7 +91,7 @@ class DataRowBatch:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
+                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)
pixeltable/exec/exec_context.py
CHANGED
pixeltable/exec/exec_node.py
CHANGED
@@ -6,7 +6,7 @@ import logging
 import sys
 from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar

-
+from pixeltable import exprs

 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -31,6 +31,7 @@ class ExecNode(abc.ABC):
         input_exprs: Iterable[exprs.Expr],
         input: Optional[ExecNode] = None,
     ):
+        assert all(expr.is_valid for expr in output_exprs)
         self.output_exprs = output_exprs
         self.row_builder = row_builder
         self.input = input
@@ -65,7 +66,7 @@
             # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
             # multiple run_until_complete()
             running_loop = asyncio.get_running_loop()
-            import nest_asyncio # type: ignore
+            import nest_asyncio # type: ignore[import-untyped]

             nest_asyncio.apply()
             loop = running_loop
pixeltable/exec/expr_eval/schedulers.py
CHANGED
@@ -9,6 +9,7 @@ import time
 from typing import Awaitable, Collection, Optional

 from pixeltable import env, func
+from pixeltable.config import Config

 from .globals import Dispatcher, FnCallArgs, Scheduler

@@ -276,7 +277,7 @@ class RequestRateScheduler(Scheduler):
         _, endpoint, model = elems
         section = f'{endpoint}.rate_limits'
         key = model
-        requests_per_min =
+        requests_per_min = Config.get().get_int_value(key, section=section)
         requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
         self.secs_per_request = 1 / (requests_per_min / 60)

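Note: with the scheduler change above, per-model request rates are read through Config (section '<endpoint>.rate_limits', key = model name), falling back to DEFAULT_RATE_LIMIT when unset. A hedged sketch of the lookup and the pacing arithmetic, using a hypothetical endpoint/model pair:

    from pixeltable.config import Config

    # hypothetical example: 'openai' endpoint, model 'gpt-4o-mini';
    # the value would live under [openai.rate_limits] in config.toml, or be absent
    requests_per_min = Config.get().get_int_value('gpt-4o-mini', section='openai.rate_limits')
    requests_per_min = requests_per_min or 600        # fall back to some default rate limit
    secs_per_request = 1 / (requests_per_min / 60)    # 600 rpm -> 0.1 s between requests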