pixeltable 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/insertable_table.py +9 -7
- pixeltable/catalog/table.py +18 -5
- pixeltable/catalog/table_version.py +1 -1
- pixeltable/catalog/view.py +1 -1
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +140 -40
- pixeltable/exceptions.py +12 -5
- pixeltable/exec/component_iteration_node.py +63 -42
- pixeltable/exprs/__init__.py +1 -2
- pixeltable/exprs/expr.py +5 -6
- pixeltable/exprs/function_call.py +8 -10
- pixeltable/exprs/inline_expr.py +200 -0
- pixeltable/exprs/json_path.py +3 -6
- pixeltable/ext/functions/whisperx.py +2 -0
- pixeltable/ext/functions/yolox.py +5 -3
- pixeltable/functions/huggingface.py +89 -12
- pixeltable/functions/image.py +3 -3
- pixeltable/functions/together.py +37 -16
- pixeltable/functions/vision.py +43 -21
- pixeltable/functions/whisper.py +3 -0
- pixeltable/globals.py +7 -1
- pixeltable/io/globals.py +1 -1
- pixeltable/io/hf_datasets.py +3 -3
- pixeltable/iterators/document.py +1 -1
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_18.py +1 -1
- pixeltable/metadata/converters/convert_20.py +56 -0
- pixeltable/metadata/converters/util.py +29 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/tool/create_test_db_dump.py +15 -4
- pixeltable/type_system.py +3 -1
- pixeltable/utils/filecache.py +126 -79
- pixeltable-0.2.20.dist-info/LICENSE +201 -0
- {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/METADATA +16 -6
- {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/RECORD +39 -39
- pixeltable/exprs/inline_array.py +0 -117
- pixeltable/exprs/inline_dict.py +0 -104
- pixeltable-0.2.18.dist-info/LICENSE +0 -18
- {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.18.dist-info → pixeltable-0.2.20.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -4,7 +4,7 @@ from .exceptions import Error
|
|
|
4
4
|
from .exprs import RELATIVE_PATH_ROOT
|
|
5
5
|
from .func import Function, udf, Aggregator, uda, expr_udf
|
|
6
6
|
from .globals import init, create_table, create_view, get_table, move, drop_table, list_tables, create_dir, drop_dir, \
|
|
7
|
-
list_dirs, list_functions, configure_logging
|
|
7
|
+
list_dirs, list_functions, configure_logging, array
|
|
8
8
|
from .type_system import (
|
|
9
9
|
ColumnType,
|
|
10
10
|
StringType,
|
pixeltable/__version__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# These version placeholders will be replaced during build.
|
|
2
|
-
__version__ = "0.2.
|
|
3
|
-
__version_tuple__ = (0, 2,
|
|
2
|
+
__version__ = "0.2.20"
|
|
3
|
+
__version_tuple__ = (0, 2, 20)
|
|
@@ -10,6 +10,7 @@ import pixeltable as pxt
|
|
|
10
10
|
import pixeltable.type_system as ts
|
|
11
11
|
from pixeltable import exceptions as excs
|
|
12
12
|
from pixeltable.env import Env
|
|
13
|
+
from pixeltable.utils.filecache import FileCache
|
|
13
14
|
|
|
14
15
|
from .catalog import Catalog
|
|
15
16
|
from .globals import UpdateStatus
|
|
@@ -101,21 +102,22 @@ class InsertableTable(Table):
|
|
|
101
102
|
if not isinstance(row, dict):
|
|
102
103
|
raise excs.Error('rows must be a list of dictionaries')
|
|
103
104
|
self._validate_input_rows(rows)
|
|
104
|
-
|
|
105
|
+
status = self._tbl_version.insert(rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception)
|
|
105
106
|
|
|
106
|
-
if
|
|
107
|
+
if status.num_excs == 0:
|
|
107
108
|
cols_with_excs_str = ''
|
|
108
109
|
else:
|
|
109
110
|
cols_with_excs_str = \
|
|
110
|
-
f' across {len(
|
|
111
|
-
cols_with_excs_str += f' ({", ".join(
|
|
111
|
+
f' across {len(status.cols_with_excs)} column{"" if len(status.cols_with_excs) == 1 else "s"}'
|
|
112
|
+
cols_with_excs_str += f' ({", ".join(status.cols_with_excs)})'
|
|
112
113
|
msg = (
|
|
113
|
-
f'Inserted {
|
|
114
|
-
f'with {
|
|
114
|
+
f'Inserted {status.num_rows} row{"" if status.num_rows == 1 else "s"} '
|
|
115
|
+
f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}{cols_with_excs_str}.'
|
|
115
116
|
)
|
|
116
117
|
print(msg)
|
|
117
118
|
_logger.info(f'InsertableTable {self._name}: {msg}')
|
|
118
|
-
|
|
119
|
+
FileCache.get().emit_eviction_warnings()
|
|
120
|
+
return status
|
|
119
121
|
|
|
120
122
|
def _validate_input_rows(self, rows: List[Dict[str, Any]]) -> None:
|
|
121
123
|
"""Verify that the input rows match the table schema"""
|
pixeltable/catalog/table.py
CHANGED
|
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
4
|
import builtins
|
|
5
|
-
import itertools
|
|
6
5
|
import json
|
|
7
6
|
import logging
|
|
8
7
|
from pathlib import Path
|
|
@@ -21,6 +20,7 @@ import pixeltable.exprs as exprs
|
|
|
21
20
|
import pixeltable.index as index
|
|
22
21
|
import pixeltable.metadata.schema as schema
|
|
23
22
|
import pixeltable.type_system as ts
|
|
23
|
+
from pixeltable.utils.filecache import FileCache
|
|
24
24
|
|
|
25
25
|
from .column import Column
|
|
26
26
|
from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier
|
|
@@ -34,7 +34,12 @@ if TYPE_CHECKING:
|
|
|
34
34
|
_logger = logging.getLogger('pixeltable')
|
|
35
35
|
|
|
36
36
|
class Table(SchemaObject):
|
|
37
|
-
"""
|
|
37
|
+
"""
|
|
38
|
+
Base class for table objects (base tables, views, snapshots).
|
|
39
|
+
|
|
40
|
+
Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
|
|
41
|
+
FileCache.emit_eviction_warnings() at the end of the operation.
|
|
42
|
+
"""
|
|
38
43
|
|
|
39
44
|
def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
|
|
40
45
|
super().__init__(id, name, dir_id)
|
|
@@ -375,7 +380,10 @@ class Table(SchemaObject):
|
|
|
375
380
|
|
|
376
381
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
377
382
|
self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
|
|
378
|
-
|
|
383
|
+
status = self._tbl_version.add_column(new_col, print_stats=print_stats)
|
|
384
|
+
FileCache.get().emit_eviction_warnings()
|
|
385
|
+
return status
|
|
386
|
+
|
|
379
387
|
|
|
380
388
|
@classmethod
|
|
381
389
|
def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
|
|
@@ -588,6 +596,7 @@ class Table(SchemaObject):
|
|
|
588
596
|
idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
|
|
589
597
|
status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
|
|
590
598
|
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
599
|
+
FileCache.get().emit_eviction_warnings()
|
|
591
600
|
|
|
592
601
|
def drop_embedding_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
|
|
593
602
|
"""Drop an embedding index from the table.
|
|
@@ -733,7 +742,9 @@ class Table(SchemaObject):
|
|
|
733
742
|
>>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
|
|
734
743
|
"""
|
|
735
744
|
self._check_is_dropped()
|
|
736
|
-
|
|
745
|
+
status = self._tbl_version.update(value_spec, where, cascade)
|
|
746
|
+
FileCache.get().emit_eviction_warnings()
|
|
747
|
+
return status
|
|
737
748
|
|
|
738
749
|
def batch_update(
|
|
739
750
|
self, rows: Iterable[dict[str, Any]], cascade: bool = True,
|
|
@@ -790,9 +801,11 @@ class Table(SchemaObject):
|
|
|
790
801
|
missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
|
|
791
802
|
raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
|
|
792
803
|
row_updates.append(col_vals)
|
|
793
|
-
|
|
804
|
+
status = self._tbl_version.batch_update(
|
|
794
805
|
row_updates, rowids, error_if_not_exists=if_not_exists == 'error',
|
|
795
806
|
insert_if_not_exists=if_not_exists == 'insert', cascade=cascade)
|
|
807
|
+
FileCache.get().emit_eviction_warnings()
|
|
808
|
+
return status
|
|
796
809
|
|
|
797
810
|
def delete(self, where: Optional['pixeltable.exprs.Expr'] = None) -> UpdateStatus:
|
|
798
811
|
"""Delete rows in this table.
|
|
@@ -147,7 +147,7 @@ class TableVersion:
|
|
|
147
147
|
module = importlib.import_module(module_name)
|
|
148
148
|
self.iterator_cls = getattr(module, class_name)
|
|
149
149
|
self.iterator_args = exprs.InlineDict.from_dict(tbl_md.view_md.iterator_args)
|
|
150
|
-
output_schema, _ = self.iterator_cls.output_schema(**self.iterator_args.
|
|
150
|
+
output_schema, _ = self.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
|
|
151
151
|
self.num_iterator_cols = len(output_schema)
|
|
152
152
|
assert tbl_md.view_md.iterator_args is not None
|
|
153
153
|
|
pixeltable/catalog/view.py
CHANGED
|
@@ -94,7 +94,7 @@ class View(Table):
|
|
|
94
94
|
]
|
|
95
95
|
sig = func.Signature(InvalidType(), params)
|
|
96
96
|
from pixeltable.exprs import FunctionCall
|
|
97
|
-
FunctionCall.normalize_args(sig, bound_args)
|
|
97
|
+
FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
|
|
98
98
|
except TypeError as e:
|
|
99
99
|
raise Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
100
100
|
|
pixeltable/dataframe.py
CHANGED
|
@@ -501,7 +501,7 @@ class DataFrame:
|
|
|
501
501
|
elif isinstance(raw_expr, dict):
|
|
502
502
|
select_list.append((exprs.InlineDict(raw_expr), name))
|
|
503
503
|
elif isinstance(raw_expr, list):
|
|
504
|
-
select_list.append((exprs.
|
|
504
|
+
select_list.append((exprs.InlineList(raw_expr), name))
|
|
505
505
|
else:
|
|
506
506
|
select_list.append((exprs.Literal(raw_expr), name))
|
|
507
507
|
expr = select_list[-1][0]
|
pixeltable/env.py
CHANGED
|
@@ -8,18 +8,20 @@ import importlib.util
|
|
|
8
8
|
import inspect
|
|
9
9
|
import logging
|
|
10
10
|
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
11
13
|
import sys
|
|
12
14
|
import threading
|
|
13
15
|
import uuid
|
|
14
16
|
import warnings
|
|
15
17
|
from dataclasses import dataclass
|
|
16
18
|
from pathlib import Path
|
|
17
|
-
from typing import TYPE_CHECKING, Any, Callable, Optional
|
|
19
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
|
|
18
20
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
19
21
|
|
|
20
22
|
import pixeltable_pgserver
|
|
21
23
|
import sqlalchemy as sql
|
|
22
|
-
import
|
|
24
|
+
import toml
|
|
23
25
|
from tqdm import TqdmWarning
|
|
24
26
|
|
|
25
27
|
import pixeltable.exceptions as excs
|
|
@@ -63,7 +65,7 @@ class Env:
|
|
|
63
65
|
_log_to_stdout: bool
|
|
64
66
|
_module_log_level: dict[str, int] # module name -> log level
|
|
65
67
|
_config_file: Optional[Path]
|
|
66
|
-
_config: Optional[
|
|
68
|
+
_config: Optional[Config]
|
|
67
69
|
_stdout_handler: logging.StreamHandler
|
|
68
70
|
_initialized: bool
|
|
69
71
|
|
|
@@ -109,6 +111,7 @@ class Env:
|
|
|
109
111
|
self._log_to_stdout = False
|
|
110
112
|
self._module_log_level = {} # module name -> log level
|
|
111
113
|
|
|
114
|
+
# config
|
|
112
115
|
self._config_file = None
|
|
113
116
|
self._config = None
|
|
114
117
|
|
|
@@ -118,7 +121,8 @@ class Env:
|
|
|
118
121
|
self._initialized = False
|
|
119
122
|
|
|
120
123
|
@property
|
|
121
|
-
def config(self):
|
|
124
|
+
def config(self) -> Config:
|
|
125
|
+
assert self._config is not None
|
|
122
126
|
return self._config
|
|
123
127
|
|
|
124
128
|
@property
|
|
@@ -226,30 +230,13 @@ class Env:
|
|
|
226
230
|
home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
|
|
227
231
|
assert self._home is None or self._home == home
|
|
228
232
|
self._home = home
|
|
229
|
-
self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.
|
|
233
|
+
self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.toml')))
|
|
230
234
|
self._media_dir = self._home / 'media'
|
|
231
235
|
self._file_cache_dir = self._home / 'file_cache'
|
|
232
236
|
self._dataset_cache_dir = self._home / 'dataset_cache'
|
|
233
237
|
self._log_dir = self._home / 'logs'
|
|
234
238
|
self._tmp_dir = self._home / 'tmp'
|
|
235
239
|
|
|
236
|
-
# Read in the config
|
|
237
|
-
if os.path.isfile(self._config_file):
|
|
238
|
-
with open(self._config_file, 'r') as stream:
|
|
239
|
-
try:
|
|
240
|
-
self._config = yaml.safe_load(stream)
|
|
241
|
-
except yaml.YAMLError as exc:
|
|
242
|
-
self._logger.error(f'Could not read config file: {self._config_file}')
|
|
243
|
-
self._config = {}
|
|
244
|
-
else:
|
|
245
|
-
self._config = {}
|
|
246
|
-
|
|
247
|
-
# Disable spurious warnings
|
|
248
|
-
warnings.simplefilter('ignore', category=TqdmWarning)
|
|
249
|
-
if 'hide_warnings' in self._config and self._config['hide_warnings']:
|
|
250
|
-
# Disable more warnings
|
|
251
|
-
warnings.simplefilter('ignore', category=UserWarning)
|
|
252
|
-
|
|
253
240
|
if self._home.exists() and not self._home.is_dir():
|
|
254
241
|
raise RuntimeError(f'{self._home} is not a directory')
|
|
255
242
|
|
|
@@ -273,6 +260,22 @@ class Env:
|
|
|
273
260
|
if not self._tmp_dir.exists():
|
|
274
261
|
self._tmp_dir.mkdir()
|
|
275
262
|
|
|
263
|
+
# Read in the config
|
|
264
|
+
self._config = Config.from_file(self._config_file)
|
|
265
|
+
self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
|
|
266
|
+
if self._file_cache_size_g is None:
|
|
267
|
+
raise excs.Error(
|
|
268
|
+
'pixeltable/file_cache_size_g is missing from configuration\n'
|
|
269
|
+
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {self._config_file},\n'
|
|
270
|
+
'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
# Disable spurious warnings
|
|
274
|
+
warnings.simplefilter('ignore', category=TqdmWarning)
|
|
275
|
+
if self._config.get_bool_value('hide_warnings'):
|
|
276
|
+
# Disable more warnings
|
|
277
|
+
warnings.simplefilter('ignore', category=UserWarning)
|
|
278
|
+
|
|
276
279
|
# configure _logger to log to a file
|
|
277
280
|
self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
|
|
278
281
|
fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
|
|
@@ -312,7 +315,7 @@ class Env:
|
|
|
312
315
|
self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
|
|
313
316
|
self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
|
|
314
317
|
|
|
315
|
-
tz_name =
|
|
318
|
+
tz_name = self.config.get_string_value('time_zone')
|
|
316
319
|
if tz_name is not None:
|
|
317
320
|
# Validate tzname
|
|
318
321
|
if not isinstance(tz_name, str):
|
|
@@ -439,21 +442,18 @@ class Env:
|
|
|
439
442
|
if cl.client_obj is not None:
|
|
440
443
|
return cl.client_obj # Already initialized
|
|
441
444
|
|
|
442
|
-
# Construct a client
|
|
443
|
-
# if not, look in Pixeltable config from `config.yaml`.
|
|
445
|
+
# Construct a client, retrieving each parameter from config.
|
|
444
446
|
|
|
445
447
|
init_kwargs: dict[str, str] = {}
|
|
446
448
|
for param in cl.param_names:
|
|
447
|
-
|
|
448
|
-
if
|
|
449
|
-
init_kwargs[param] =
|
|
450
|
-
|
|
451
|
-
init_kwargs[param] = self._config[name.lower()][param.lower()]
|
|
452
|
-
if param not in init_kwargs or init_kwargs[param] == '':
|
|
449
|
+
arg = self._config.get_string_value(param, section=name)
|
|
450
|
+
if arg is not None and len(arg) > 0:
|
|
451
|
+
init_kwargs[param] = arg
|
|
452
|
+
else:
|
|
453
453
|
raise excs.Error(
|
|
454
454
|
f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
|
|
455
|
-
f'To fix this, specify the `{
|
|
456
|
-
f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.
|
|
455
|
+
f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, or put `{param.lower()}` in '
|
|
456
|
+
f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
457
457
|
)
|
|
458
458
|
|
|
459
459
|
cl.client_obj = cl.init_fn(**init_kwargs)
|
|
@@ -486,6 +486,8 @@ class Env:
|
|
|
486
486
|
"""Check for and start runtime services"""
|
|
487
487
|
self._start_web_server()
|
|
488
488
|
self.__register_packages()
|
|
489
|
+
if self.is_installed_package('spacy'):
|
|
490
|
+
self.__init_spacy()
|
|
489
491
|
|
|
490
492
|
def __register_packages(self) -> None:
|
|
491
493
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
@@ -500,10 +502,9 @@ class Env:
|
|
|
500
502
|
self.__register_package('openpyxl')
|
|
501
503
|
self.__register_package('pyarrow')
|
|
502
504
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
503
|
-
self.__register_package('spacy')
|
|
505
|
+
self.__register_package('spacy')
|
|
504
506
|
self.__register_package('tiktoken')
|
|
505
507
|
self.__register_package('together')
|
|
506
|
-
self.__register_package('toml')
|
|
507
508
|
self.__register_package('torch')
|
|
508
509
|
self.__register_package('torchvision')
|
|
509
510
|
self.__register_package('transformers')
|
|
@@ -511,10 +512,6 @@ class Env:
|
|
|
511
512
|
self.__register_package('whisperx')
|
|
512
513
|
self.__register_package('yolox', library_name='git+https://github.com/Megvii-BaseDetection/YOLOX@ac58e0a')
|
|
513
514
|
|
|
514
|
-
if self.is_installed_package('spacy'):
|
|
515
|
-
import spacy
|
|
516
|
-
self._spacy_nlp = spacy.load('en_core_web_sm')
|
|
517
|
-
|
|
518
515
|
def __register_package(self, package_name: str, library_name: Optional[str] = None) -> None:
|
|
519
516
|
self.__optional_packages[package_name] = PackageInfo(
|
|
520
517
|
is_installed=importlib.util.find_spec(package_name) is not None,
|
|
@@ -556,6 +553,35 @@ class Env:
|
|
|
556
553
|
f'To fix this, run: `pip install -U {package_info.library_name}`'
|
|
557
554
|
)
|
|
558
555
|
|
|
556
|
+
def __init_spacy(self) -> None:
|
|
557
|
+
"""
|
|
558
|
+
spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
|
|
559
|
+
dependency, we install it programmatically here. This should cause no problems, since the model packages
|
|
560
|
+
have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
|
|
561
|
+
"""
|
|
562
|
+
import spacy
|
|
563
|
+
from spacy.cli.download import get_model_filename
|
|
564
|
+
spacy_model = 'en_core_web_sm'
|
|
565
|
+
spacy_model_version = '3.7.1'
|
|
566
|
+
filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
|
|
567
|
+
url = f'{spacy.about.__download_url__}/{filename}'
|
|
568
|
+
# Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
|
|
569
|
+
# a problem, because the model have been installed on a previous attempt.
|
|
570
|
+
self._logger.info(f'Ensuring spaCy model is installed: {filename}')
|
|
571
|
+
ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
|
|
572
|
+
if ret.returncode != 0:
|
|
573
|
+
self._logger.warn(f'pip install failed for spaCy model: {filename}')
|
|
574
|
+
try:
|
|
575
|
+
self._logger.info(f'Loading spaCy model: {spacy_model}')
|
|
576
|
+
self._spacy_nlp = spacy.load(spacy_model)
|
|
577
|
+
except Exception as exc:
|
|
578
|
+
self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
|
|
579
|
+
warnings.warn(
|
|
580
|
+
f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
|
|
581
|
+
excs.PixeltableWarning
|
|
582
|
+
)
|
|
583
|
+
self.__optional_packages['spacy'].is_installed = False
|
|
584
|
+
|
|
559
585
|
def num_tmp_files(self) -> int:
|
|
560
586
|
return len(glob.glob(f'{self._tmp_dir}/*'))
|
|
561
587
|
|
|
@@ -594,6 +620,7 @@ class Env:
|
|
|
594
620
|
|
|
595
621
|
@property
|
|
596
622
|
def spacy_nlp(self) -> spacy.Language:
|
|
623
|
+
Env.get().require_package('spacy')
|
|
597
624
|
assert self._spacy_nlp is not None
|
|
598
625
|
return self._spacy_nlp
|
|
599
626
|
|
|
@@ -614,7 +641,7 @@ def register_client(name: str) -> Callable:
|
|
|
614
641
|
Pixeltable will attempt to load the client parameters from config. For each
|
|
615
642
|
config parameter:
|
|
616
643
|
- If an environment variable named MY_CLIENT_API_KEY (for example) is set, use it;
|
|
617
|
-
- Otherwise, look for 'api_key' in the 'my_client' section of config.
|
|
644
|
+
- Otherwise, look for 'api_key' in the 'my_client' section of config.toml.
|
|
618
645
|
|
|
619
646
|
If all config parameters are found, Pixeltable calls the initialization function;
|
|
620
647
|
otherwise it throws an exception.
|
|
@@ -631,6 +658,79 @@ def register_client(name: str) -> Callable:
|
|
|
631
658
|
return decorator
|
|
632
659
|
|
|
633
660
|
|
|
661
|
+
class Config:
|
|
662
|
+
"""
|
|
663
|
+
The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
|
|
664
|
+
configuration values, which can be set in the config file or as environment variables.
|
|
665
|
+
"""
|
|
666
|
+
__config: dict[str, Any]
|
|
667
|
+
|
|
668
|
+
T = TypeVar('T')
|
|
669
|
+
|
|
670
|
+
@classmethod
|
|
671
|
+
def from_file(cls, path: Path) -> Config:
|
|
672
|
+
"""
|
|
673
|
+
Loads configuration from the specified TOML file. If the file does not exist, it will be
|
|
674
|
+
created and populated with the default configuration.
|
|
675
|
+
"""
|
|
676
|
+
if os.path.isfile(path):
|
|
677
|
+
with open(path, 'r') as stream:
|
|
678
|
+
try:
|
|
679
|
+
config_dict = toml.load(stream)
|
|
680
|
+
except Exception as exc:
|
|
681
|
+
raise excs.Error(f'Could not read config file: {str(path)}') from exc
|
|
682
|
+
else:
|
|
683
|
+
config_dict = cls.__create_default_config(path)
|
|
684
|
+
with open(path, 'w') as stream:
|
|
685
|
+
try:
|
|
686
|
+
toml.dump(config_dict, stream)
|
|
687
|
+
except Exception as exc:
|
|
688
|
+
raise excs.Error(f'Could not write config file: {str(path)}') from exc
|
|
689
|
+
logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
|
|
690
|
+
return cls(config_dict)
|
|
691
|
+
|
|
692
|
+
@classmethod
|
|
693
|
+
def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
|
|
694
|
+
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
695
|
+
# Default cache size is 1/5 of free disk space
|
|
696
|
+
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
697
|
+
return {
|
|
698
|
+
'pixeltable': {
|
|
699
|
+
'file_cache_size_g': round(file_cache_size_g, 1),
|
|
700
|
+
'hide_warnings': False,
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
def __init__(self, config: dict[str, Any]) -> None:
|
|
705
|
+
self.__config = config
|
|
706
|
+
|
|
707
|
+
def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
|
|
708
|
+
env_var = f'{section.upper()}_{key.upper()}'
|
|
709
|
+
if env_var in os.environ:
|
|
710
|
+
value = os.environ[env_var]
|
|
711
|
+
elif section in self.__config and key in self.__config[section]:
|
|
712
|
+
value = self.__config[section][key]
|
|
713
|
+
else:
|
|
714
|
+
return None
|
|
715
|
+
|
|
716
|
+
try:
|
|
717
|
+
return expected_type(value) # type: ignore[call-arg]
|
|
718
|
+
except ValueError:
|
|
719
|
+
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
|
|
720
|
+
|
|
721
|
+
def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
|
|
722
|
+
return self.get_value(key, str, section)
|
|
723
|
+
|
|
724
|
+
def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
|
|
725
|
+
return self.get_value(key, int, section)
|
|
726
|
+
|
|
727
|
+
def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
|
|
728
|
+
return self.get_value(key, float, section)
|
|
729
|
+
|
|
730
|
+
def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
|
|
731
|
+
return self.get_value(key, bool, section)
|
|
732
|
+
|
|
733
|
+
|
|
634
734
|
_registered_clients: dict[str, ApiClient] = {}
|
|
635
735
|
|
|
636
736
|
|
pixeltable/exceptions.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
-
from typing import List, Any
|
|
2
|
-
from types import TracebackType
|
|
3
1
|
from dataclasses import dataclass
|
|
2
|
+
from types import TracebackType
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from pixeltable import exprs
|
|
4
7
|
|
|
5
8
|
|
|
6
9
|
class Error(Exception):
|
|
@@ -9,9 +12,13 @@ class Error(Exception):
|
|
|
9
12
|
|
|
10
13
|
@dataclass
|
|
11
14
|
class ExprEvalError(Exception):
|
|
12
|
-
expr:
|
|
15
|
+
expr: 'exprs.Expr'
|
|
13
16
|
expr_msg: str
|
|
14
17
|
exc: Exception
|
|
15
18
|
exc_tb: TracebackType
|
|
16
|
-
input_vals:
|
|
17
|
-
row_num: int
|
|
19
|
+
input_vals: list[Any]
|
|
20
|
+
row_num: int
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PixeltableWarning(Warning):
|
|
24
|
+
pass
|
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
import inspect
|
|
2
|
+
from typing import Iterator, Optional
|
|
2
3
|
|
|
3
|
-
from .data_row_batch import DataRowBatch
|
|
4
|
-
from .exec_node import ExecNode
|
|
5
4
|
import pixeltable.catalog as catalog
|
|
6
|
-
import pixeltable.exprs as exprs
|
|
7
5
|
import pixeltable.exceptions as excs
|
|
6
|
+
import pixeltable.exprs as exprs
|
|
7
|
+
|
|
8
|
+
from .data_row_batch import DataRowBatch
|
|
9
|
+
from .exec_node import ExecNode
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class ComponentIterationNode(ExecNode):
|
|
@@ -12,7 +14,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
12
14
|
|
|
13
15
|
Returns row batches of OUTPUT_BATCH_SIZE size.
|
|
14
16
|
"""
|
|
15
|
-
|
|
17
|
+
__OUTPUT_BATCH_SIZE = 1024
|
|
16
18
|
|
|
17
19
|
def __init__(self, view: catalog.TableVersion, input: ExecNode):
|
|
18
20
|
assert view.is_component_view()
|
|
@@ -23,57 +25,76 @@ class ComponentIterationNode(ExecNode):
|
|
|
23
25
|
self.iterator_args = iterator_args[0]
|
|
24
26
|
assert isinstance(self.iterator_args, exprs.InlineDict)
|
|
25
27
|
self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
|
|
26
|
-
self.iterator_output_schema, self.unstored_column_names =
|
|
27
|
-
self.view.iterator_cls.output_schema(**self.iterator_args.
|
|
28
|
+
self.iterator_output_schema, self.unstored_column_names = (
|
|
29
|
+
self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
|
|
30
|
+
)
|
|
28
31
|
self.iterator_output_fields = list(self.iterator_output_schema.keys())
|
|
29
|
-
self.iterator_output_cols =
|
|
30
|
-
|
|
32
|
+
self.iterator_output_cols = {
|
|
33
|
+
field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
|
|
34
|
+
}
|
|
31
35
|
# referenced iterator output fields
|
|
32
36
|
self.refd_output_slot_idxs = {
|
|
33
37
|
e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
|
|
34
38
|
if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
|
|
35
39
|
}
|
|
36
|
-
self.
|
|
40
|
+
self.__output: Optional[Iterator[DataRowBatch]] = None
|
|
37
41
|
|
|
38
|
-
def
|
|
42
|
+
def __output_batches(self) -> Iterator[DataRowBatch]:
|
|
39
43
|
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
40
44
|
for input_batch in self.input:
|
|
41
45
|
for input_row in input_batch:
|
|
42
46
|
self.row_builder.eval(input_row, self.iterator_args_ctx)
|
|
43
47
|
iterator_args = input_row[self.iterator_args.slot_idx]
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# we can ignore this
|
|
59
|
-
continue
|
|
60
|
-
output_col = self.iterator_output_cols[field_name]
|
|
61
|
-
output_col.col_type.validate_literal(field_val)
|
|
62
|
-
output_row[self.refd_output_slot_idxs[field_name]] = field_val
|
|
63
|
-
if len(component_dict) != len(self.iterator_output_fields):
|
|
64
|
-
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
65
|
-
raise excs.Error(
|
|
66
|
-
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
67
|
-
f'missing fields {", ".join(missing_fields)}')
|
|
68
|
-
|
|
69
|
-
if len(output_batch) == self.OUTPUT_BATCH_SIZE:
|
|
70
|
-
yield output_batch
|
|
71
|
-
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
48
|
+
assert isinstance(iterator_args, dict)
|
|
49
|
+
# We need to ensure that all of the required (non-nullable) parameters of the iterator are
|
|
50
|
+
# specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
|
|
51
|
+
# output rows for this input row).
|
|
52
|
+
if self.__non_nullable_args_specified(iterator_args):
|
|
53
|
+
iterator = self.view.iterator_cls(**iterator_args)
|
|
54
|
+
for pos, component_dict in enumerate(iterator):
|
|
55
|
+
output_row = output_batch.add_row()
|
|
56
|
+
input_row.copy(output_row)
|
|
57
|
+
# we're expanding the input and need to add the iterator position to the pk
|
|
58
|
+
self.__populate_output_row(output_row, pos, component_dict)
|
|
59
|
+
if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
|
|
60
|
+
yield output_batch
|
|
61
|
+
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
72
62
|
|
|
73
63
|
if len(output_batch) > 0:
|
|
74
64
|
yield output_batch
|
|
75
65
|
|
|
66
|
+
def __non_nullable_args_specified(self, iterator_args: dict) -> bool:
|
|
67
|
+
"""
|
|
68
|
+
Returns true if all non-nullable iterator arguments are not `None`.
|
|
69
|
+
"""
|
|
70
|
+
input_schema = self.view.iterator_cls.input_schema()
|
|
71
|
+
for arg_name, arg_value in iterator_args.items():
|
|
72
|
+
col_type = input_schema[arg_name]
|
|
73
|
+
if arg_value is None and not col_type.nullable:
|
|
74
|
+
return False
|
|
75
|
+
return True
|
|
76
|
+
|
|
77
|
+
def __populate_output_row(self, output_row: exprs.DataRow, pos: int, component_dict: dict) -> None:
|
|
78
|
+
pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
|
|
79
|
+
output_row.set_pk(pk)
|
|
80
|
+
# verify and copy component_dict fields to their respective slots in output_row
|
|
81
|
+
for field_name, field_val in component_dict.items():
|
|
82
|
+
if field_name not in self.iterator_output_fields:
|
|
83
|
+
raise excs.Error(
|
|
84
|
+
f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
85
|
+
if field_name not in self.refd_output_slot_idxs:
|
|
86
|
+
# we can ignore this
|
|
87
|
+
continue
|
|
88
|
+
output_col = self.iterator_output_cols[field_name]
|
|
89
|
+
output_col.col_type.validate_literal(field_val)
|
|
90
|
+
output_row[self.refd_output_slot_idxs[field_name]] = field_val
|
|
91
|
+
if len(component_dict) != len(self.iterator_output_fields):
|
|
92
|
+
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
93
|
+
raise excs.Error(
|
|
94
|
+
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
95
|
+
f'missing fields {", ".join(missing_fields)}')
|
|
96
|
+
|
|
76
97
|
def __next__(self) -> DataRowBatch:
|
|
77
|
-
if self.
|
|
78
|
-
self.
|
|
79
|
-
return next(self.
|
|
98
|
+
if self.__output is None:
|
|
99
|
+
self.__output = self.__output_batches()
|
|
100
|
+
return next(self.__output)
|
pixeltable/exprs/__init__.py
CHANGED
|
@@ -9,8 +9,7 @@ from .expr import Expr
|
|
|
9
9
|
from .expr_set import ExprSet
|
|
10
10
|
from .function_call import FunctionCall
|
|
11
11
|
from .in_predicate import InPredicate
|
|
12
|
-
from .
|
|
13
|
-
from .inline_dict import InlineDict
|
|
12
|
+
from .inline_expr import InlineArray, InlineDict, InlineList
|
|
14
13
|
from .is_null import IsNull
|
|
15
14
|
from .json_mapper import JsonMapper
|
|
16
15
|
from .json_path import RELATIVE_PATH_ROOT, JsonPath
|
pixeltable/exprs/expr.py
CHANGED
|
@@ -356,15 +356,14 @@ class Expr(abc.ABC):
|
|
|
356
356
|
"""
|
|
357
357
|
if isinstance(o, Expr):
|
|
358
358
|
return o
|
|
359
|
-
# Try to create a literal. We need to check for
|
|
360
|
-
# first, to prevent
|
|
359
|
+
# Try to create a literal. We need to check for InlineList/InlineDict
|
|
360
|
+
# first, to prevent them from inappropriately being interpreted as JsonType
|
|
361
361
|
# literals.
|
|
362
|
-
# TODO: general cleanup of InlineArray/InlineDict
|
|
363
362
|
if isinstance(o, list):
|
|
364
|
-
from .
|
|
365
|
-
return
|
|
363
|
+
from .inline_expr import InlineList
|
|
364
|
+
return InlineList(o)
|
|
366
365
|
if isinstance(o, dict):
|
|
367
|
-
from .
|
|
366
|
+
from .inline_expr import InlineDict
|
|
368
367
|
return InlineDict(o)
|
|
369
368
|
obj_type = ts.ColumnType.infer_literal_type(o)
|
|
370
369
|
if obj_type is not None:
|