pixeltable 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +119 -100
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +118 -122
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +322 -257
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +68 -77
- pixeltable/env.py +74 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +4 -5
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +25 -25
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +18 -20
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +2 -24
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +52 -36
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/video.py +8 -13
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +30 -28
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +125 -61
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +8 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/METADATA +1 -1
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.18.dist-info/RECORD +0 -211
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/env.py
CHANGED
|
@@ -21,7 +21,7 @@ from contextlib import contextmanager
|
|
|
21
21
|
from dataclasses import dataclass, field
|
|
22
22
|
from pathlib import Path
|
|
23
23
|
from sys import stdout
|
|
24
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterator,
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
|
|
25
25
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
26
26
|
|
|
27
27
|
import nest_asyncio # type: ignore[import-untyped]
|
|
@@ -38,7 +38,7 @@ from pixeltable.config import Config
|
|
|
38
38
|
from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
|
|
39
39
|
from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
|
|
40
40
|
from pixeltable.utils.http_server import make_server
|
|
41
|
-
from pixeltable.utils.object_stores import ObjectPath
|
|
41
|
+
from pixeltable.utils.object_stores import ObjectPath
|
|
42
42
|
|
|
43
43
|
if TYPE_CHECKING:
|
|
44
44
|
import spacy
|
|
@@ -56,47 +56,50 @@ class Env:
|
|
|
56
56
|
For a non-local environment, Pixeltable uses a connection string to the externally managed database.
|
|
57
57
|
"""
|
|
58
58
|
|
|
59
|
-
|
|
59
|
+
SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
|
|
60
|
+
|
|
61
|
+
_instance: Env | None = None
|
|
60
62
|
__initializing: bool = False
|
|
61
63
|
_log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
|
|
62
64
|
|
|
63
|
-
_media_dir:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
_default_time_zone: Optional[ZoneInfo]
|
|
65
|
+
_media_dir: Path | None
|
|
66
|
+
_file_cache_dir: Path | None # cached object files with external URL
|
|
67
|
+
_dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
|
|
68
|
+
_log_dir: Path | None # log files
|
|
69
|
+
_tmp_dir: Path | None # any tmp files
|
|
70
|
+
_sa_engine: sql.engine.base.Engine | None
|
|
71
|
+
_pgdata_dir: Path | None
|
|
72
|
+
_db_name: str | None
|
|
73
|
+
_db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
|
|
74
|
+
_db_url: str | None
|
|
75
|
+
_default_time_zone: ZoneInfo | None
|
|
75
76
|
_verbosity: int
|
|
76
77
|
|
|
77
78
|
# info about optional packages that are utilized by some parts of the code
|
|
78
79
|
__optional_packages: dict[str, PackageInfo]
|
|
79
80
|
|
|
80
|
-
_spacy_nlp:
|
|
81
|
-
_httpd:
|
|
82
|
-
_http_address:
|
|
81
|
+
_spacy_nlp: spacy.Language | None
|
|
82
|
+
_httpd: http.server.HTTPServer | None
|
|
83
|
+
_http_address: str | None
|
|
83
84
|
_logger: logging.Logger
|
|
84
85
|
_default_log_level: int
|
|
85
|
-
_logfilename:
|
|
86
|
+
_logfilename: str | None
|
|
86
87
|
_log_to_stdout: bool
|
|
87
88
|
_module_log_level: dict[str, int] # module name -> log level
|
|
88
89
|
_file_cache_size_g: float
|
|
89
|
-
|
|
90
|
+
_default_input_media_dest: str | None
|
|
91
|
+
_default_output_media_dest: str | None
|
|
92
|
+
_pxt_api_key: str | None
|
|
90
93
|
_stdout_handler: logging.StreamHandler
|
|
91
94
|
_default_video_encoder: str | None
|
|
92
95
|
_initialized: bool
|
|
93
96
|
|
|
94
97
|
_resource_pool_info: dict[str, Any]
|
|
95
|
-
_current_conn:
|
|
96
|
-
_current_session:
|
|
97
|
-
_current_isolation_level:
|
|
98
|
-
_dbms:
|
|
99
|
-
_event_loop:
|
|
98
|
+
_current_conn: sql.Connection | None
|
|
99
|
+
_current_session: orm.Session | None
|
|
100
|
+
_current_isolation_level: str | None
|
|
101
|
+
_dbms: Dbms | None
|
|
102
|
+
_event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
|
|
100
103
|
|
|
101
104
|
@classmethod
|
|
102
105
|
def get(cls) -> Env:
|
|
@@ -125,7 +128,6 @@ class Env:
|
|
|
125
128
|
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
126
129
|
|
|
127
130
|
self._media_dir = None # computed media files
|
|
128
|
-
self._object_soa = None # computed object files in StorageObjectAddress format
|
|
129
131
|
self._file_cache_dir = None # cached object files with external URL
|
|
130
132
|
self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
|
|
131
133
|
self._log_dir = None # log files
|
|
@@ -200,11 +202,11 @@ class Env:
|
|
|
200
202
|
return self._http_address
|
|
201
203
|
|
|
202
204
|
@property
|
|
203
|
-
def user(self) ->
|
|
205
|
+
def user(self) -> str | None:
|
|
204
206
|
return Config.get().get_string_value('user')
|
|
205
207
|
|
|
206
208
|
@user.setter
|
|
207
|
-
def user(self, user:
|
|
209
|
+
def user(self, user: str | None) -> None:
|
|
208
210
|
if user is None:
|
|
209
211
|
if 'PIXELTABLE_USER' in os.environ:
|
|
210
212
|
del os.environ['PIXELTABLE_USER']
|
|
@@ -212,11 +214,11 @@ class Env:
|
|
|
212
214
|
os.environ['PIXELTABLE_USER'] = user
|
|
213
215
|
|
|
214
216
|
@property
|
|
215
|
-
def default_time_zone(self) ->
|
|
217
|
+
def default_time_zone(self) -> ZoneInfo | None:
|
|
216
218
|
return self._default_time_zone
|
|
217
219
|
|
|
218
220
|
@default_time_zone.setter
|
|
219
|
-
def default_time_zone(self, tz:
|
|
221
|
+
def default_time_zone(self, tz: ZoneInfo | None) -> None:
|
|
220
222
|
"""
|
|
221
223
|
This is not a publicly visible setter; it is only for testing purposes.
|
|
222
224
|
"""
|
|
@@ -233,17 +235,17 @@ class Env:
|
|
|
233
235
|
return self._verbosity
|
|
234
236
|
|
|
235
237
|
@property
|
|
236
|
-
def conn(self) ->
|
|
238
|
+
def conn(self) -> sql.Connection | None:
|
|
237
239
|
assert self._current_conn is not None
|
|
238
240
|
return self._current_conn
|
|
239
241
|
|
|
240
242
|
@property
|
|
241
|
-
def session(self) ->
|
|
243
|
+
def session(self) -> orm.Session | None:
|
|
242
244
|
assert self._current_session is not None
|
|
243
245
|
return self._current_session
|
|
244
246
|
|
|
245
247
|
@property
|
|
246
|
-
def dbms(self) ->
|
|
248
|
+
def dbms(self) -> Dbms | None:
|
|
247
249
|
assert self._dbms is not None
|
|
248
250
|
return self._dbms
|
|
249
251
|
|
|
@@ -274,7 +276,7 @@ class Env:
|
|
|
274
276
|
if self._current_conn is None:
|
|
275
277
|
assert self._current_session is None
|
|
276
278
|
try:
|
|
277
|
-
self._current_isolation_level =
|
|
279
|
+
self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
|
|
278
280
|
with (
|
|
279
281
|
self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
|
|
280
282
|
orm.Session(conn) as session,
|
|
@@ -289,16 +291,16 @@ class Env:
|
|
|
289
291
|
self._current_isolation_level = None
|
|
290
292
|
else:
|
|
291
293
|
assert self._current_session is not None
|
|
292
|
-
assert
|
|
294
|
+
assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
|
|
293
295
|
yield self._current_conn
|
|
294
296
|
|
|
295
297
|
def configure_logging(
|
|
296
298
|
self,
|
|
297
299
|
*,
|
|
298
|
-
to_stdout:
|
|
299
|
-
level:
|
|
300
|
-
add:
|
|
301
|
-
remove:
|
|
300
|
+
to_stdout: bool | None = None,
|
|
301
|
+
level: int | None = None,
|
|
302
|
+
add: str | None = None,
|
|
303
|
+
remove: str | None = None,
|
|
302
304
|
) -> None:
|
|
303
305
|
"""Configure logging.
|
|
304
306
|
|
|
@@ -340,7 +342,7 @@ class Env:
|
|
|
340
342
|
def set_log_level(self, level: int) -> None:
|
|
341
343
|
self._default_log_level = level
|
|
342
344
|
|
|
343
|
-
def set_module_log_level(self, module: str, level:
|
|
345
|
+
def set_module_log_level(self, module: str, level: int | None) -> None:
|
|
344
346
|
if level is None:
|
|
345
347
|
self._module_log_level.pop(module, None)
|
|
346
348
|
else:
|
|
@@ -396,23 +398,18 @@ class Env:
|
|
|
396
398
|
config = Config.get()
|
|
397
399
|
|
|
398
400
|
self._initialized = True
|
|
401
|
+
|
|
399
402
|
self._media_dir = Config.get().home / 'media'
|
|
400
403
|
self._file_cache_dir = Config.get().home / 'file_cache'
|
|
401
404
|
self._dataset_cache_dir = Config.get().home / 'dataset_cache'
|
|
402
405
|
self._log_dir = Config.get().home / 'logs'
|
|
403
406
|
self._tmp_dir = Config.get().home / 'tmp'
|
|
404
407
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
self.
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
if not self._dataset_cache_dir.exists():
|
|
411
|
-
self._dataset_cache_dir.mkdir()
|
|
412
|
-
if not self._log_dir.exists():
|
|
413
|
-
self._log_dir.mkdir()
|
|
414
|
-
if not self._tmp_dir.exists():
|
|
415
|
-
self._tmp_dir.mkdir()
|
|
408
|
+
self._media_dir.mkdir(exist_ok=True)
|
|
409
|
+
self._file_cache_dir.mkdir(exist_ok=True)
|
|
410
|
+
self._dataset_cache_dir.mkdir(exist_ok=True)
|
|
411
|
+
self._log_dir.mkdir(exist_ok=True)
|
|
412
|
+
self._tmp_dir.mkdir(exist_ok=True)
|
|
416
413
|
|
|
417
414
|
self._file_cache_size_g = config.get_float_value('file_cache_size_g')
|
|
418
415
|
if self._file_cache_size_g is None:
|
|
@@ -421,6 +418,16 @@ class Env:
|
|
|
421
418
|
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
|
|
422
419
|
'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
|
|
423
420
|
)
|
|
421
|
+
|
|
422
|
+
self._default_input_media_dest = config.get_string_value('input_media_dest')
|
|
423
|
+
self._default_output_media_dest = config.get_string_value('output_media_dest')
|
|
424
|
+
for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
|
|
425
|
+
if uri is not None:
|
|
426
|
+
try:
|
|
427
|
+
_ = ObjectPath.parse_object_storage_addr(uri, False)
|
|
428
|
+
except Exception as e:
|
|
429
|
+
raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
|
|
430
|
+
|
|
424
431
|
self._pxt_api_key = config.get_string_value('api_key')
|
|
425
432
|
|
|
426
433
|
# Disable spurious warnings
|
|
@@ -647,7 +654,7 @@ class Env:
|
|
|
647
654
|
metadata.upgrade_md(self._sa_engine)
|
|
648
655
|
|
|
649
656
|
@property
|
|
650
|
-
def pxt_api_key(self) ->
|
|
657
|
+
def pxt_api_key(self) -> str | None:
|
|
651
658
|
return self._pxt_api_key
|
|
652
659
|
|
|
653
660
|
def get_client(self, name: str) -> Any:
|
|
@@ -669,7 +676,7 @@ class Env:
|
|
|
669
676
|
# Determine the type of the parameter for proper config parsing.
|
|
670
677
|
pname = param.name
|
|
671
678
|
t = param.annotation
|
|
672
|
-
# Deference
|
|
679
|
+
# Deference T | None
|
|
673
680
|
if typing.get_origin(t) in (typing.Union, types.UnionType):
|
|
674
681
|
args = typing.get_args(t)
|
|
675
682
|
if args[0] is type(None):
|
|
@@ -767,6 +774,7 @@ class Env:
|
|
|
767
774
|
self.__register_package('datasets')
|
|
768
775
|
self.__register_package('diffusers')
|
|
769
776
|
self.__register_package('fiftyone')
|
|
777
|
+
self.__register_package('twelvelabs')
|
|
770
778
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
771
779
|
self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
|
|
772
780
|
self.__register_package('google.genai', library_name='google-genai')
|
|
@@ -799,7 +807,7 @@ class Env:
|
|
|
799
807
|
self.__register_package('yolox', library_name='pixeltable-yolox')
|
|
800
808
|
self.__register_package('lancedb')
|
|
801
809
|
|
|
802
|
-
def __register_package(self, package_name: str, library_name:
|
|
810
|
+
def __register_package(self, package_name: str, library_name: str | None = None) -> None:
|
|
803
811
|
is_installed: bool
|
|
804
812
|
try:
|
|
805
813
|
is_installed = importlib.util.find_spec(package_name) is not None
|
|
@@ -815,7 +823,7 @@ class Env:
|
|
|
815
823
|
if not shutil.which(binary_name):
|
|
816
824
|
raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
|
|
817
825
|
|
|
818
|
-
def require_package(self, package_name: str, min_version:
|
|
826
|
+
def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
|
|
819
827
|
"""
|
|
820
828
|
Checks whether the specified optional package is available. If not, raises an exception
|
|
821
829
|
with an error message informing the user how to install it.
|
|
@@ -859,8 +867,8 @@ class Env:
|
|
|
859
867
|
else:
|
|
860
868
|
os.remove(path)
|
|
861
869
|
|
|
862
|
-
# def get_resource_pool_info(self, pool_id: str, pool_info_cls:
|
|
863
|
-
def get_resource_pool_info(self, pool_id: str, make_pool_info:
|
|
870
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
|
|
871
|
+
def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
|
|
864
872
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
865
873
|
info = self._resource_pool_info.get(pool_id)
|
|
866
874
|
if info is None and make_pool_info is not None:
|
|
@@ -874,10 +882,12 @@ class Env:
|
|
|
874
882
|
return self._media_dir
|
|
875
883
|
|
|
876
884
|
@property
|
|
877
|
-
def
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
885
|
+
def default_input_media_dest(self) -> str | None:
|
|
886
|
+
return self._default_input_media_dest
|
|
887
|
+
|
|
888
|
+
@property
|
|
889
|
+
def default_output_media_dest(self) -> str | None:
|
|
890
|
+
return self._default_output_media_dest
|
|
881
891
|
|
|
882
892
|
@property
|
|
883
893
|
def file_cache_dir(self) -> Path:
|
|
@@ -1025,14 +1035,14 @@ _registered_clients: dict[str, ApiClient] = {}
|
|
|
1025
1035
|
class ApiClient:
|
|
1026
1036
|
init_fn: Callable
|
|
1027
1037
|
params: dict[str, inspect.Parameter]
|
|
1028
|
-
client_obj:
|
|
1038
|
+
client_obj: Any | None = None
|
|
1029
1039
|
|
|
1030
1040
|
|
|
1031
1041
|
@dataclass
|
|
1032
1042
|
class PackageInfo:
|
|
1033
1043
|
is_installed: bool
|
|
1034
1044
|
library_name: str # pypi library name (may be different from package name)
|
|
1035
|
-
version:
|
|
1045
|
+
version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
|
|
1036
1046
|
|
|
1037
1047
|
|
|
1038
1048
|
TIME_FORMAT = '%H:%M.%S %f'
|
|
@@ -1093,7 +1103,7 @@ class RateLimitsInfo:
|
|
|
1093
1103
|
"""Update self.resource_limits based on the exception headers"""
|
|
1094
1104
|
self.has_exc = True
|
|
1095
1105
|
|
|
1096
|
-
def get_retry_delay(self, exc: Exception) ->
|
|
1106
|
+
def get_retry_delay(self, exc: Exception) -> float | None:
|
|
1097
1107
|
"""Returns number of seconds to wait before retry, or None if not retryable"""
|
|
1098
1108
|
if len(self.resource_limits) == 0:
|
|
1099
1109
|
return 1.0
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Any, AsyncIterator, Iterable,
|
|
5
|
+
from typing import Any, AsyncIterator, Iterable, cast
|
|
6
6
|
|
|
7
7
|
from pixeltable import catalog, exceptions as excs, exprs
|
|
8
8
|
|
|
@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
|
|
|
19
19
|
At the moment, this returns all results in a single DataRowBatch.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
|
-
group_by:
|
|
22
|
+
group_by: list[exprs.Expr] | None
|
|
23
23
|
input_exprs: list[exprs.Expr]
|
|
24
24
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
25
25
|
agg_fn_calls: list[exprs.FunctionCall]
|
|
26
26
|
output_batch: DataRowBatch
|
|
27
|
-
limit:
|
|
27
|
+
limit: int | None
|
|
28
28
|
|
|
29
29
|
def __init__(
|
|
30
30
|
self,
|
|
31
31
|
tbl: catalog.TableVersionHandle,
|
|
32
32
|
row_builder: exprs.RowBuilder,
|
|
33
|
-
group_by:
|
|
33
|
+
group_by: list[exprs.Expr] | None,
|
|
34
34
|
agg_fn_calls: list[exprs.FunctionCall],
|
|
35
35
|
input_exprs: Iterable[exprs.Expr],
|
|
36
36
|
input: ExecNode,
|
|
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
|
|
|
72
72
|
raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
|
|
73
73
|
|
|
74
74
|
async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
|
|
75
|
-
prev_row:
|
|
76
|
-
current_group:
|
|
75
|
+
prev_row: exprs.DataRow | None = None
|
|
76
|
+
current_group: list[Any] | None = None # the values of the group-by exprs
|
|
77
77
|
num_input_rows = 0
|
|
78
78
|
num_output_rows = 0
|
|
79
79
|
async for row_batch in self.input:
|
|
@@ -9,7 +9,7 @@ import urllib.request
|
|
|
9
9
|
from collections import deque
|
|
10
10
|
from concurrent import futures
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import AsyncIterator, Iterator
|
|
12
|
+
from typing import AsyncIterator, Iterator
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
from pixeltable import exceptions as excs, exprs
|
|
@@ -43,18 +43,18 @@ class CachePrefetchNode(ExecNode):
|
|
|
43
43
|
|
|
44
44
|
# ready_rows: rows that are ready to be returned, ordered by row idx;
|
|
45
45
|
# the implied row idx of ready_rows[0] is num_returned_rows
|
|
46
|
-
ready_rows: deque[
|
|
46
|
+
ready_rows: deque[exprs.DataRow | None]
|
|
47
47
|
|
|
48
48
|
in_flight_rows: dict[int, CachePrefetchNode.RowState] # rows with in-flight urls; id(row) -> RowState
|
|
49
49
|
in_flight_requests: dict[futures.Future, str] # in-flight requests for urls; future -> URL
|
|
50
50
|
in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]] # URL -> [(row, info)]
|
|
51
51
|
input_finished: bool
|
|
52
|
-
row_idx: Iterator[
|
|
52
|
+
row_idx: Iterator[int | None]
|
|
53
53
|
|
|
54
54
|
@dataclasses.dataclass
|
|
55
55
|
class RowState:
|
|
56
56
|
row: exprs.DataRow
|
|
57
|
-
idx:
|
|
57
|
+
idx: int | None # position in input stream; None if we don't retain input order
|
|
58
58
|
num_missing: int # number of missing URLs in this row
|
|
59
59
|
|
|
60
60
|
def __init__(
|
|
@@ -78,7 +78,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
78
78
|
def queued_work(self) -> int:
|
|
79
79
|
return len(self.in_flight_requests)
|
|
80
80
|
|
|
81
|
-
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) ->
|
|
81
|
+
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
|
|
82
82
|
"""Get the next batch of input rows, or None if there are no more rows"""
|
|
83
83
|
try:
|
|
84
84
|
input_batch = await anext(input_iter)
|
|
@@ -127,7 +127,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
127
127
|
sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
|
|
128
128
|
)
|
|
129
129
|
|
|
130
|
-
def __add_ready_row(self, row: exprs.DataRow, row_idx:
|
|
130
|
+
def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
|
|
131
131
|
if row_idx is None:
|
|
132
132
|
self.ready_rows.append(row)
|
|
133
133
|
else:
|
|
@@ -144,12 +144,12 @@ class CachePrefetchNode(ExecNode):
|
|
|
144
144
|
tmp_path, exc = f.result()
|
|
145
145
|
if exc is not None and not ignore_errors:
|
|
146
146
|
raise exc
|
|
147
|
-
local_path:
|
|
147
|
+
local_path: Path | None = None
|
|
148
148
|
if tmp_path is not None:
|
|
149
149
|
# register the file with the cache for the first column in which it's missing
|
|
150
150
|
assert url in self.in_flight_urls
|
|
151
151
|
_, info = self.in_flight_urls[url][0]
|
|
152
|
-
local_path = file_cache.add(info.col.
|
|
152
|
+
local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
|
|
153
153
|
_logger.debug(f'cached {url} as {local_path}')
|
|
154
154
|
|
|
155
155
|
# add the local path/exception to the slots that reference the url
|
|
@@ -174,7 +174,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
174
174
|
# the time it takes to get the next batch together
|
|
175
175
|
cache_misses: list[str] = []
|
|
176
176
|
|
|
177
|
-
url_pos: dict[str,
|
|
177
|
+
url_pos: dict[str, int | None] = {} # url -> row_idx; used for logging
|
|
178
178
|
for row in input_batch:
|
|
179
179
|
# identify missing local files in input batch, or fill in their paths if they're already cached
|
|
180
180
|
num_missing = 0
|
|
@@ -213,7 +213,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
213
213
|
_logger.debug(f'submitted {url} for idx {url_pos[url]}')
|
|
214
214
|
self.in_flight_requests[f] = url
|
|
215
215
|
|
|
216
|
-
def __fetch_url(self, url: str) -> tuple[
|
|
216
|
+
def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
|
|
217
217
|
"""Fetches a remote URL into the TempStore and returns its path"""
|
|
218
218
|
from pixeltable.utils.local_store import TempStore
|
|
219
219
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Iterator
|
|
4
|
+
from typing import Iterator
|
|
5
5
|
|
|
6
6
|
from pixeltable import exprs
|
|
7
7
|
|
|
@@ -19,11 +19,11 @@ class DataRowBatch:
|
|
|
19
19
|
row_builder: exprs.RowBuilder
|
|
20
20
|
rows: list[exprs.DataRow]
|
|
21
21
|
|
|
22
|
-
def __init__(self, row_builder: exprs.RowBuilder, rows:
|
|
22
|
+
def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
|
|
23
23
|
self.row_builder = row_builder
|
|
24
24
|
self.rows = [] if rows is None else rows
|
|
25
25
|
|
|
26
|
-
def add_row(self, row:
|
|
26
|
+
def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
|
|
27
27
|
if row is None:
|
|
28
28
|
row = self.row_builder.make_row()
|
|
29
29
|
self.rows.append(row)
|
pixeltable/exec/exec_context.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import random
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
3
|
import sqlalchemy as sql
|
|
5
4
|
|
|
@@ -13,9 +12,9 @@ class ExecContext:
|
|
|
13
12
|
profile: exprs.ExecProfile
|
|
14
13
|
show_pbar: bool
|
|
15
14
|
batch_size: int
|
|
16
|
-
num_rows:
|
|
17
|
-
conn:
|
|
18
|
-
pk_clause:
|
|
15
|
+
num_rows: int | None
|
|
16
|
+
conn: sql.engine.Connection | None
|
|
17
|
+
pk_clause: list[sql.ClauseElement] | None
|
|
19
18
|
num_computed_exprs: int
|
|
20
19
|
ignore_errors: bool
|
|
21
20
|
random_seed: int # general-purpose source of randomness with execution scope
|
|
@@ -26,7 +25,7 @@ class ExecContext:
|
|
|
26
25
|
*,
|
|
27
26
|
show_pbar: bool = False,
|
|
28
27
|
batch_size: int = 0,
|
|
29
|
-
pk_clause:
|
|
28
|
+
pk_clause: list[sql.ClauseElement] | None = None,
|
|
30
29
|
num_computed_exprs: int = 0,
|
|
31
30
|
ignore_errors: bool = False,
|
|
32
31
|
):
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
4
|
import logging
|
|
5
|
-
from typing import AsyncIterator, Iterable, Iterator,
|
|
5
|
+
from typing import AsyncIterator, Iterable, Iterator, TypeVar
|
|
6
6
|
|
|
7
7
|
from pixeltable import exprs
|
|
8
8
|
from pixeltable.env import Env
|
|
@@ -18,16 +18,16 @@ class ExecNode(abc.ABC):
|
|
|
18
18
|
|
|
19
19
|
output_exprs: Iterable[exprs.Expr]
|
|
20
20
|
row_builder: exprs.RowBuilder
|
|
21
|
-
input:
|
|
21
|
+
input: ExecNode | None
|
|
22
22
|
flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
|
|
23
|
-
ctx:
|
|
23
|
+
ctx: ExecContext | None
|
|
24
24
|
|
|
25
25
|
def __init__(
|
|
26
26
|
self,
|
|
27
27
|
row_builder: exprs.RowBuilder,
|
|
28
28
|
output_exprs: Iterable[exprs.Expr],
|
|
29
29
|
input_exprs: Iterable[exprs.Expr],
|
|
30
|
-
input:
|
|
30
|
+
input: ExecNode | None = None,
|
|
31
31
|
):
|
|
32
32
|
assert all(expr.is_valid for expr in output_exprs)
|
|
33
33
|
self.output_exprs = output_exprs
|
|
@@ -85,7 +85,7 @@ class ExecNode(abc.ABC):
|
|
|
85
85
|
|
|
86
86
|
T = TypeVar('T', bound='ExecNode')
|
|
87
87
|
|
|
88
|
-
def get_node(self, node_class: type[T]) ->
|
|
88
|
+
def get_node(self, node_class: type[T]) -> T | None:
|
|
89
89
|
if isinstance(self, node_class):
|
|
90
90
|
return self
|
|
91
91
|
if self.input is not None:
|
|
@@ -5,7 +5,7 @@ import datetime
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
|
-
from typing import Any, Callable, Iterator,
|
|
8
|
+
from typing import Any, Callable, Iterator, cast
|
|
9
9
|
|
|
10
10
|
from pixeltable import exprs, func
|
|
11
11
|
|
|
@@ -64,11 +64,11 @@ class FnCallEvaluator(Evaluator):
|
|
|
64
64
|
|
|
65
65
|
fn_call: exprs.FunctionCall
|
|
66
66
|
fn: func.CallableFunction
|
|
67
|
-
scalar_py_fn:
|
|
67
|
+
scalar_py_fn: Callable | None # only set for non-batching CallableFunctions
|
|
68
68
|
|
|
69
69
|
# only set if fn.is_batched
|
|
70
|
-
call_args_queue:
|
|
71
|
-
batch_size:
|
|
70
|
+
call_args_queue: asyncio.Queue[FnCallArgs] | None # FnCallArgs waiting for execution
|
|
71
|
+
batch_size: int | None
|
|
72
72
|
|
|
73
73
|
def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
74
74
|
super().__init__(dispatcher, exec_ctx)
|
|
@@ -160,8 +160,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
160
160
|
|
|
161
161
|
def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
|
|
162
162
|
"""Roll call_args into a single batched FnCallArgs"""
|
|
163
|
-
batch_args: list[list[
|
|
164
|
-
batch_kwargs: dict[str, list[
|
|
163
|
+
batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
|
|
164
|
+
batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
|
|
165
165
|
assert isinstance(self.fn, func.CallableFunction)
|
|
166
166
|
for i, item in enumerate(call_args):
|
|
167
167
|
for j in range(len(item.args)):
|
|
@@ -4,7 +4,7 @@ import asyncio
|
|
|
4
4
|
import logging
|
|
5
5
|
import traceback
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import AsyncIterator, Iterable
|
|
7
|
+
from typing import AsyncIterator, Iterable
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
@@ -49,17 +49,17 @@ class ExprEvalNode(ExecNode):
|
|
|
49
49
|
# execution state
|
|
50
50
|
tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
|
|
51
51
|
exc_event: asyncio.Event # set if an exception needs to be propagated
|
|
52
|
-
error:
|
|
52
|
+
error: Exception | None # exception that needs to be propagated
|
|
53
53
|
completed_rows: asyncio.Queue[exprs.DataRow] # rows that have completed evaluation
|
|
54
54
|
completed_event: asyncio.Event # set when completed_rows is non-empty
|
|
55
55
|
input_iter: AsyncIterator[DataRowBatch]
|
|
56
|
-
current_input_batch:
|
|
56
|
+
current_input_batch: DataRowBatch | None # batch from which we're currently consuming rows
|
|
57
57
|
input_row_idx: int # next row to consume from current_input_batch
|
|
58
|
-
next_input_batch:
|
|
58
|
+
next_input_batch: DataRowBatch | None # read-ahead input batch
|
|
59
59
|
avail_input_rows: int # total number across both current_/next_input_batch
|
|
60
60
|
input_complete: bool # True if we've received all input batches
|
|
61
61
|
num_in_flight: int # number of dispatched rows that haven't completed
|
|
62
|
-
row_pos_map:
|
|
62
|
+
row_pos_map: dict[int, int] | None # id(row) -> position of row in input; only set if maintain_input_order
|
|
63
63
|
output_buffer: RowBuffer # holds rows that are ready to be returned, in order
|
|
64
64
|
|
|
65
65
|
# debugging
|
|
@@ -217,9 +217,10 @@ class ExprEvalNode(ExecNode):
|
|
|
217
217
|
|
|
218
218
|
row: exprs.DataRow
|
|
219
219
|
exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
|
|
220
|
-
input_batch_aw:
|
|
221
|
-
completed_aw:
|
|
220
|
+
input_batch_aw: asyncio.Task | None = None
|
|
221
|
+
completed_aw: asyncio.Task | None = None
|
|
222
222
|
closed_evaluators = False # True after calling Evaluator.close()
|
|
223
|
+
exprs.Expr.prepare_list(self.exec_ctx.all_exprs)
|
|
223
224
|
|
|
224
225
|
try:
|
|
225
226
|
while True:
|
|
@@ -4,7 +4,7 @@ import abc
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import Any, Iterable,
|
|
7
|
+
from typing import Any, Iterable, Protocol
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
@@ -18,11 +18,11 @@ class FnCallArgs:
|
|
|
18
18
|
fn_call: exprs.FunctionCall
|
|
19
19
|
rows: list[exprs.DataRow]
|
|
20
20
|
# single call
|
|
21
|
-
args:
|
|
22
|
-
kwargs:
|
|
21
|
+
args: list[Any] | None = None
|
|
22
|
+
kwargs: dict[str, Any] | None = None
|
|
23
23
|
# batch call
|
|
24
|
-
batch_args:
|
|
25
|
-
batch_kwargs:
|
|
24
|
+
batch_args: list[list[Any | None]] | None = None
|
|
25
|
+
batch_kwargs: dict[str, list[Any | None]] | None = None
|
|
26
26
|
|
|
27
27
|
@property
|
|
28
28
|
def pxt_fn(self) -> func.CallableFunction:
|
|
@@ -56,7 +56,7 @@ class Scheduler(abc.ABC):
|
|
|
56
56
|
request: FnCallArgs
|
|
57
57
|
num_retries: int
|
|
58
58
|
exec_ctx: ExecCtx
|
|
59
|
-
retry_after:
|
|
59
|
+
retry_after: float | None = None # time.monotonic()
|
|
60
60
|
|
|
61
61
|
def __lt__(self, other: Scheduler.QueueItem) -> bool:
|
|
62
62
|
# prioritize by number of retries (more retries = higher priority)
|