pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/env.py
CHANGED
|
@@ -21,7 +21,7 @@ from contextlib import contextmanager
|
|
|
21
21
|
from dataclasses import dataclass, field
|
|
22
22
|
from pathlib import Path
|
|
23
23
|
from sys import stdout
|
|
24
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterator,
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
|
|
25
25
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
26
26
|
|
|
27
27
|
import nest_asyncio # type: ignore[import-untyped]
|
|
@@ -38,7 +38,7 @@ from pixeltable.config import Config
|
|
|
38
38
|
from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
|
|
39
39
|
from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
|
|
40
40
|
from pixeltable.utils.http_server import make_server
|
|
41
|
-
from pixeltable.utils.object_stores import ObjectPath
|
|
41
|
+
from pixeltable.utils.object_stores import ObjectPath
|
|
42
42
|
|
|
43
43
|
if TYPE_CHECKING:
|
|
44
44
|
import spacy
|
|
@@ -56,47 +56,50 @@ class Env:
|
|
|
56
56
|
For a non-local environment, Pixeltable uses a connection string to the externally managed database.
|
|
57
57
|
"""
|
|
58
58
|
|
|
59
|
-
|
|
59
|
+
SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
|
|
60
|
+
|
|
61
|
+
_instance: Env | None = None
|
|
60
62
|
__initializing: bool = False
|
|
61
63
|
_log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
|
|
62
64
|
|
|
63
|
-
_media_dir:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
_default_time_zone: Optional[ZoneInfo]
|
|
65
|
+
_media_dir: Path | None
|
|
66
|
+
_file_cache_dir: Path | None # cached object files with external URL
|
|
67
|
+
_dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
|
|
68
|
+
_log_dir: Path | None # log files
|
|
69
|
+
_tmp_dir: Path | None # any tmp files
|
|
70
|
+
_sa_engine: sql.engine.base.Engine | None
|
|
71
|
+
_pgdata_dir: Path | None
|
|
72
|
+
_db_name: str | None
|
|
73
|
+
_db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
|
|
74
|
+
_db_url: str | None
|
|
75
|
+
_default_time_zone: ZoneInfo | None
|
|
75
76
|
_verbosity: int
|
|
76
77
|
|
|
77
78
|
# info about optional packages that are utilized by some parts of the code
|
|
78
79
|
__optional_packages: dict[str, PackageInfo]
|
|
79
80
|
|
|
80
|
-
_spacy_nlp:
|
|
81
|
-
_httpd:
|
|
82
|
-
_http_address:
|
|
81
|
+
_spacy_nlp: spacy.Language | None
|
|
82
|
+
_httpd: http.server.HTTPServer | None
|
|
83
|
+
_http_address: str | None
|
|
83
84
|
_logger: logging.Logger
|
|
84
85
|
_default_log_level: int
|
|
85
|
-
_logfilename:
|
|
86
|
+
_logfilename: str | None
|
|
86
87
|
_log_to_stdout: bool
|
|
87
88
|
_module_log_level: dict[str, int] # module name -> log level
|
|
88
89
|
_file_cache_size_g: float
|
|
89
|
-
|
|
90
|
+
_default_input_media_dest: str | None
|
|
91
|
+
_default_output_media_dest: str | None
|
|
92
|
+
_pxt_api_key: str | None
|
|
90
93
|
_stdout_handler: logging.StreamHandler
|
|
91
94
|
_default_video_encoder: str | None
|
|
92
95
|
_initialized: bool
|
|
93
96
|
|
|
94
97
|
_resource_pool_info: dict[str, Any]
|
|
95
|
-
_current_conn:
|
|
96
|
-
_current_session:
|
|
97
|
-
_current_isolation_level:
|
|
98
|
-
_dbms:
|
|
99
|
-
_event_loop:
|
|
98
|
+
_current_conn: sql.Connection | None
|
|
99
|
+
_current_session: orm.Session | None
|
|
100
|
+
_current_isolation_level: str | None
|
|
101
|
+
_dbms: Dbms | None
|
|
102
|
+
_event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
|
|
100
103
|
|
|
101
104
|
@classmethod
|
|
102
105
|
def get(cls) -> Env:
|
|
@@ -125,7 +128,6 @@ class Env:
|
|
|
125
128
|
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
126
129
|
|
|
127
130
|
self._media_dir = None # computed media files
|
|
128
|
-
self._object_soa = None # computed object files in StorageObjectAddress format
|
|
129
131
|
self._file_cache_dir = None # cached object files with external URL
|
|
130
132
|
self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
|
|
131
133
|
self._log_dir = None # log files
|
|
@@ -200,11 +202,11 @@ class Env:
|
|
|
200
202
|
return self._http_address
|
|
201
203
|
|
|
202
204
|
@property
|
|
203
|
-
def user(self) ->
|
|
205
|
+
def user(self) -> str | None:
|
|
204
206
|
return Config.get().get_string_value('user')
|
|
205
207
|
|
|
206
208
|
@user.setter
|
|
207
|
-
def user(self, user:
|
|
209
|
+
def user(self, user: str | None) -> None:
|
|
208
210
|
if user is None:
|
|
209
211
|
if 'PIXELTABLE_USER' in os.environ:
|
|
210
212
|
del os.environ['PIXELTABLE_USER']
|
|
@@ -212,11 +214,11 @@ class Env:
|
|
|
212
214
|
os.environ['PIXELTABLE_USER'] = user
|
|
213
215
|
|
|
214
216
|
@property
|
|
215
|
-
def default_time_zone(self) ->
|
|
217
|
+
def default_time_zone(self) -> ZoneInfo | None:
|
|
216
218
|
return self._default_time_zone
|
|
217
219
|
|
|
218
220
|
@default_time_zone.setter
|
|
219
|
-
def default_time_zone(self, tz:
|
|
221
|
+
def default_time_zone(self, tz: ZoneInfo | None) -> None:
|
|
220
222
|
"""
|
|
221
223
|
This is not a publicly visible setter; it is only for testing purposes.
|
|
222
224
|
"""
|
|
@@ -233,17 +235,17 @@ class Env:
|
|
|
233
235
|
return self._verbosity
|
|
234
236
|
|
|
235
237
|
@property
|
|
236
|
-
def conn(self) ->
|
|
238
|
+
def conn(self) -> sql.Connection | None:
|
|
237
239
|
assert self._current_conn is not None
|
|
238
240
|
return self._current_conn
|
|
239
241
|
|
|
240
242
|
@property
|
|
241
|
-
def session(self) ->
|
|
243
|
+
def session(self) -> orm.Session | None:
|
|
242
244
|
assert self._current_session is not None
|
|
243
245
|
return self._current_session
|
|
244
246
|
|
|
245
247
|
@property
|
|
246
|
-
def dbms(self) ->
|
|
248
|
+
def dbms(self) -> Dbms | None:
|
|
247
249
|
assert self._dbms is not None
|
|
248
250
|
return self._dbms
|
|
249
251
|
|
|
@@ -274,7 +276,7 @@ class Env:
|
|
|
274
276
|
if self._current_conn is None:
|
|
275
277
|
assert self._current_session is None
|
|
276
278
|
try:
|
|
277
|
-
self._current_isolation_level =
|
|
279
|
+
self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
|
|
278
280
|
with (
|
|
279
281
|
self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
|
|
280
282
|
orm.Session(conn) as session,
|
|
@@ -289,16 +291,16 @@ class Env:
|
|
|
289
291
|
self._current_isolation_level = None
|
|
290
292
|
else:
|
|
291
293
|
assert self._current_session is not None
|
|
292
|
-
assert
|
|
294
|
+
assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
|
|
293
295
|
yield self._current_conn
|
|
294
296
|
|
|
295
297
|
def configure_logging(
|
|
296
298
|
self,
|
|
297
299
|
*,
|
|
298
|
-
to_stdout:
|
|
299
|
-
level:
|
|
300
|
-
add:
|
|
301
|
-
remove:
|
|
300
|
+
to_stdout: bool | None = None,
|
|
301
|
+
level: int | None = None,
|
|
302
|
+
add: str | None = None,
|
|
303
|
+
remove: str | None = None,
|
|
302
304
|
) -> None:
|
|
303
305
|
"""Configure logging.
|
|
304
306
|
|
|
@@ -340,7 +342,7 @@ class Env:
|
|
|
340
342
|
def set_log_level(self, level: int) -> None:
|
|
341
343
|
self._default_log_level = level
|
|
342
344
|
|
|
343
|
-
def set_module_log_level(self, module: str, level:
|
|
345
|
+
def set_module_log_level(self, module: str, level: int | None) -> None:
|
|
344
346
|
if level is None:
|
|
345
347
|
self._module_log_level.pop(module, None)
|
|
346
348
|
else:
|
|
@@ -396,23 +398,18 @@ class Env:
|
|
|
396
398
|
config = Config.get()
|
|
397
399
|
|
|
398
400
|
self._initialized = True
|
|
401
|
+
|
|
399
402
|
self._media_dir = Config.get().home / 'media'
|
|
400
403
|
self._file_cache_dir = Config.get().home / 'file_cache'
|
|
401
404
|
self._dataset_cache_dir = Config.get().home / 'dataset_cache'
|
|
402
405
|
self._log_dir = Config.get().home / 'logs'
|
|
403
406
|
self._tmp_dir = Config.get().home / 'tmp'
|
|
404
407
|
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
self.
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
if not self._dataset_cache_dir.exists():
|
|
411
|
-
self._dataset_cache_dir.mkdir()
|
|
412
|
-
if not self._log_dir.exists():
|
|
413
|
-
self._log_dir.mkdir()
|
|
414
|
-
if not self._tmp_dir.exists():
|
|
415
|
-
self._tmp_dir.mkdir()
|
|
408
|
+
self._media_dir.mkdir(exist_ok=True)
|
|
409
|
+
self._file_cache_dir.mkdir(exist_ok=True)
|
|
410
|
+
self._dataset_cache_dir.mkdir(exist_ok=True)
|
|
411
|
+
self._log_dir.mkdir(exist_ok=True)
|
|
412
|
+
self._tmp_dir.mkdir(exist_ok=True)
|
|
416
413
|
|
|
417
414
|
self._file_cache_size_g = config.get_float_value('file_cache_size_g')
|
|
418
415
|
if self._file_cache_size_g is None:
|
|
@@ -421,6 +418,16 @@ class Env:
|
|
|
421
418
|
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
|
|
422
419
|
'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
|
|
423
420
|
)
|
|
421
|
+
|
|
422
|
+
self._default_input_media_dest = config.get_string_value('input_media_dest')
|
|
423
|
+
self._default_output_media_dest = config.get_string_value('output_media_dest')
|
|
424
|
+
for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
|
|
425
|
+
if uri is not None:
|
|
426
|
+
try:
|
|
427
|
+
_ = ObjectPath.parse_object_storage_addr(uri, False)
|
|
428
|
+
except Exception as e:
|
|
429
|
+
raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
|
|
430
|
+
|
|
424
431
|
self._pxt_api_key = config.get_string_value('api_key')
|
|
425
432
|
|
|
426
433
|
# Disable spurious warnings
|
|
@@ -647,7 +654,7 @@ class Env:
|
|
|
647
654
|
metadata.upgrade_md(self._sa_engine)
|
|
648
655
|
|
|
649
656
|
@property
|
|
650
|
-
def pxt_api_key(self) ->
|
|
657
|
+
def pxt_api_key(self) -> str | None:
|
|
651
658
|
return self._pxt_api_key
|
|
652
659
|
|
|
653
660
|
def get_client(self, name: str) -> Any:
|
|
@@ -669,7 +676,7 @@ class Env:
|
|
|
669
676
|
# Determine the type of the parameter for proper config parsing.
|
|
670
677
|
pname = param.name
|
|
671
678
|
t = param.annotation
|
|
672
|
-
# Deference
|
|
679
|
+
# Deference T | None
|
|
673
680
|
if typing.get_origin(t) in (typing.Union, types.UnionType):
|
|
674
681
|
args = typing.get_args(t)
|
|
675
682
|
if args[0] is type(None):
|
|
@@ -760,17 +767,21 @@ class Env:
|
|
|
760
767
|
|
|
761
768
|
def __register_packages(self) -> None:
|
|
762
769
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
770
|
+
self.__register_package('accelerate')
|
|
763
771
|
self.__register_package('anthropic')
|
|
764
772
|
self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
|
|
765
773
|
self.__register_package('boto3')
|
|
766
774
|
self.__register_package('datasets')
|
|
775
|
+
self.__register_package('diffusers')
|
|
767
776
|
self.__register_package('fiftyone')
|
|
777
|
+
self.__register_package('twelvelabs')
|
|
768
778
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
769
779
|
self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
|
|
770
780
|
self.__register_package('google.genai', library_name='google-genai')
|
|
771
781
|
self.__register_package('groq')
|
|
772
782
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
773
783
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
784
|
+
self.__register_package('librosa')
|
|
774
785
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
775
786
|
self.__register_package('mcp')
|
|
776
787
|
self.__register_package('mistralai')
|
|
@@ -783,6 +794,7 @@ class Env:
|
|
|
783
794
|
self.__register_package('replicate')
|
|
784
795
|
self.__register_package('sentencepiece')
|
|
785
796
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
797
|
+
self.__register_package('soundfile')
|
|
786
798
|
self.__register_package('spacy')
|
|
787
799
|
self.__register_package('tiktoken')
|
|
788
800
|
self.__register_package('together')
|
|
@@ -795,7 +807,7 @@ class Env:
|
|
|
795
807
|
self.__register_package('yolox', library_name='pixeltable-yolox')
|
|
796
808
|
self.__register_package('lancedb')
|
|
797
809
|
|
|
798
|
-
def __register_package(self, package_name: str, library_name:
|
|
810
|
+
def __register_package(self, package_name: str, library_name: str | None = None) -> None:
|
|
799
811
|
is_installed: bool
|
|
800
812
|
try:
|
|
801
813
|
is_installed = importlib.util.find_spec(package_name) is not None
|
|
@@ -811,7 +823,7 @@ class Env:
|
|
|
811
823
|
if not shutil.which(binary_name):
|
|
812
824
|
raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
|
|
813
825
|
|
|
814
|
-
def require_package(self, package_name: str, min_version:
|
|
826
|
+
def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
|
|
815
827
|
"""
|
|
816
828
|
Checks whether the specified optional package is available. If not, raises an exception
|
|
817
829
|
with an error message informing the user how to install it.
|
|
@@ -855,8 +867,8 @@ class Env:
|
|
|
855
867
|
else:
|
|
856
868
|
os.remove(path)
|
|
857
869
|
|
|
858
|
-
# def get_resource_pool_info(self, pool_id: str, pool_info_cls:
|
|
859
|
-
def get_resource_pool_info(self, pool_id: str, make_pool_info:
|
|
870
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
|
|
871
|
+
def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
|
|
860
872
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
861
873
|
info = self._resource_pool_info.get(pool_id)
|
|
862
874
|
if info is None and make_pool_info is not None:
|
|
@@ -870,10 +882,12 @@ class Env:
|
|
|
870
882
|
return self._media_dir
|
|
871
883
|
|
|
872
884
|
@property
|
|
873
|
-
def
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
885
|
+
def default_input_media_dest(self) -> str | None:
|
|
886
|
+
return self._default_input_media_dest
|
|
887
|
+
|
|
888
|
+
@property
|
|
889
|
+
def default_output_media_dest(self) -> str | None:
|
|
890
|
+
return self._default_output_media_dest
|
|
877
891
|
|
|
878
892
|
@property
|
|
879
893
|
def file_cache_dir(self) -> Path:
|
|
@@ -1021,14 +1035,14 @@ _registered_clients: dict[str, ApiClient] = {}
|
|
|
1021
1035
|
class ApiClient:
|
|
1022
1036
|
init_fn: Callable
|
|
1023
1037
|
params: dict[str, inspect.Parameter]
|
|
1024
|
-
client_obj:
|
|
1038
|
+
client_obj: Any | None = None
|
|
1025
1039
|
|
|
1026
1040
|
|
|
1027
1041
|
@dataclass
|
|
1028
1042
|
class PackageInfo:
|
|
1029
1043
|
is_installed: bool
|
|
1030
1044
|
library_name: str # pypi library name (may be different from package name)
|
|
1031
|
-
version:
|
|
1045
|
+
version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
|
|
1032
1046
|
|
|
1033
1047
|
|
|
1034
1048
|
TIME_FORMAT = '%H:%M.%S %f'
|
|
@@ -1089,7 +1103,7 @@ class RateLimitsInfo:
|
|
|
1089
1103
|
"""Update self.resource_limits based on the exception headers"""
|
|
1090
1104
|
self.has_exc = True
|
|
1091
1105
|
|
|
1092
|
-
def get_retry_delay(self, exc: Exception) ->
|
|
1106
|
+
def get_retry_delay(self, exc: Exception) -> float | None:
|
|
1093
1107
|
"""Returns number of seconds to wait before retry, or None if not retryable"""
|
|
1094
1108
|
if len(self.resource_limits) == 0:
|
|
1095
1109
|
return 1.0
|
pixeltable/exec/aggregation_node.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Any, AsyncIterator, Iterable,
|
|
5
|
+
from typing import Any, AsyncIterator, Iterable, cast
|
|
6
6
|
|
|
7
7
|
from pixeltable import catalog, exceptions as excs, exprs
|
|
8
8
|
|
|
@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
|
|
|
19
19
|
At the moment, this returns all results in a single DataRowBatch.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
|
-
group_by:
|
|
22
|
+
group_by: list[exprs.Expr] | None
|
|
23
23
|
input_exprs: list[exprs.Expr]
|
|
24
24
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
25
25
|
agg_fn_calls: list[exprs.FunctionCall]
|
|
26
26
|
output_batch: DataRowBatch
|
|
27
|
-
limit:
|
|
27
|
+
limit: int | None
|
|
28
28
|
|
|
29
29
|
def __init__(
|
|
30
30
|
self,
|
|
31
31
|
tbl: catalog.TableVersionHandle,
|
|
32
32
|
row_builder: exprs.RowBuilder,
|
|
33
|
-
group_by:
|
|
33
|
+
group_by: list[exprs.Expr] | None,
|
|
34
34
|
agg_fn_calls: list[exprs.FunctionCall],
|
|
35
35
|
input_exprs: Iterable[exprs.Expr],
|
|
36
36
|
input: ExecNode,
|
|
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
|
|
|
72
72
|
raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
|
|
73
73
|
|
|
74
74
|
async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
|
|
75
|
-
prev_row:
|
|
76
|
-
current_group:
|
|
75
|
+
prev_row: exprs.DataRow | None = None
|
|
76
|
+
current_group: list[Any] | None = None # the values of the group-by exprs
|
|
77
77
|
num_input_rows = 0
|
|
78
78
|
num_output_rows = 0
|
|
79
79
|
async for row_batch in self.input:
|
pixeltable/exec/cache_prefetch_node.py
CHANGED
|
@@ -9,7 +9,7 @@ import urllib.request
|
|
|
9
9
|
from collections import deque
|
|
10
10
|
from concurrent import futures
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import AsyncIterator, Iterator
|
|
12
|
+
from typing import AsyncIterator, Iterator
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
from pixeltable import exceptions as excs, exprs
|
|
@@ -43,18 +43,18 @@ class CachePrefetchNode(ExecNode):
|
|
|
43
43
|
|
|
44
44
|
# ready_rows: rows that are ready to be returned, ordered by row idx;
|
|
45
45
|
# the implied row idx of ready_rows[0] is num_returned_rows
|
|
46
|
-
ready_rows: deque[
|
|
46
|
+
ready_rows: deque[exprs.DataRow | None]
|
|
47
47
|
|
|
48
48
|
in_flight_rows: dict[int, CachePrefetchNode.RowState] # rows with in-flight urls; id(row) -> RowState
|
|
49
49
|
in_flight_requests: dict[futures.Future, str] # in-flight requests for urls; future -> URL
|
|
50
50
|
in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]] # URL -> [(row, info)]
|
|
51
51
|
input_finished: bool
|
|
52
|
-
row_idx: Iterator[
|
|
52
|
+
row_idx: Iterator[int | None]
|
|
53
53
|
|
|
54
54
|
@dataclasses.dataclass
|
|
55
55
|
class RowState:
|
|
56
56
|
row: exprs.DataRow
|
|
57
|
-
idx:
|
|
57
|
+
idx: int | None # position in input stream; None if we don't retain input order
|
|
58
58
|
num_missing: int # number of missing URLs in this row
|
|
59
59
|
|
|
60
60
|
def __init__(
|
|
@@ -78,7 +78,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
78
78
|
def queued_work(self) -> int:
|
|
79
79
|
return len(self.in_flight_requests)
|
|
80
80
|
|
|
81
|
-
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) ->
|
|
81
|
+
async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
|
|
82
82
|
"""Get the next batch of input rows, or None if there are no more rows"""
|
|
83
83
|
try:
|
|
84
84
|
input_batch = await anext(input_iter)
|
|
@@ -127,7 +127,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
127
127
|
sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
|
|
128
128
|
)
|
|
129
129
|
|
|
130
|
-
def __add_ready_row(self, row: exprs.DataRow, row_idx:
|
|
130
|
+
def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
|
|
131
131
|
if row_idx is None:
|
|
132
132
|
self.ready_rows.append(row)
|
|
133
133
|
else:
|
|
@@ -144,12 +144,12 @@ class CachePrefetchNode(ExecNode):
|
|
|
144
144
|
tmp_path, exc = f.result()
|
|
145
145
|
if exc is not None and not ignore_errors:
|
|
146
146
|
raise exc
|
|
147
|
-
local_path:
|
|
147
|
+
local_path: Path | None = None
|
|
148
148
|
if tmp_path is not None:
|
|
149
149
|
# register the file with the cache for the first column in which it's missing
|
|
150
150
|
assert url in self.in_flight_urls
|
|
151
151
|
_, info = self.in_flight_urls[url][0]
|
|
152
|
-
local_path = file_cache.add(info.col.
|
|
152
|
+
local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
|
|
153
153
|
_logger.debug(f'cached {url} as {local_path}')
|
|
154
154
|
|
|
155
155
|
# add the local path/exception to the slots that reference the url
|
|
@@ -174,7 +174,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
174
174
|
# the time it takes to get the next batch together
|
|
175
175
|
cache_misses: list[str] = []
|
|
176
176
|
|
|
177
|
-
url_pos: dict[str,
|
|
177
|
+
url_pos: dict[str, int | None] = {} # url -> row_idx; used for logging
|
|
178
178
|
for row in input_batch:
|
|
179
179
|
# identify missing local files in input batch, or fill in their paths if they're already cached
|
|
180
180
|
num_missing = 0
|
|
@@ -213,7 +213,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
213
213
|
_logger.debug(f'submitted {url} for idx {url_pos[url]}')
|
|
214
214
|
self.in_flight_requests[f] = url
|
|
215
215
|
|
|
216
|
-
def __fetch_url(self, url: str) -> tuple[
|
|
216
|
+
def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
|
|
217
217
|
"""Fetches a remote URL into the TempStore and returns its path"""
|
|
218
218
|
from pixeltable.utils.local_store import TempStore
|
|
219
219
|
|
pixeltable/exec/data_row_batch.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Iterator
|
|
4
|
+
from typing import Iterator
|
|
5
5
|
|
|
6
6
|
from pixeltable import exprs
|
|
7
7
|
|
|
@@ -19,11 +19,11 @@ class DataRowBatch:
|
|
|
19
19
|
row_builder: exprs.RowBuilder
|
|
20
20
|
rows: list[exprs.DataRow]
|
|
21
21
|
|
|
22
|
-
def __init__(self, row_builder: exprs.RowBuilder, rows:
|
|
22
|
+
def __init__(self, row_builder: exprs.RowBuilder, rows: list[exprs.DataRow] | None = None):
|
|
23
23
|
self.row_builder = row_builder
|
|
24
24
|
self.rows = [] if rows is None else rows
|
|
25
25
|
|
|
26
|
-
def add_row(self, row:
|
|
26
|
+
def add_row(self, row: exprs.DataRow | None) -> exprs.DataRow:
|
|
27
27
|
if row is None:
|
|
28
28
|
row = self.row_builder.make_row()
|
|
29
29
|
self.rows.append(row)
|
pixeltable/exec/exec_context.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
import random
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -8,13 +8,24 @@ from pixeltable import exprs
|
|
|
8
8
|
class ExecContext:
|
|
9
9
|
"""Class for execution runtime constants"""
|
|
10
10
|
|
|
11
|
+
row_builder: exprs.RowBuilder
|
|
12
|
+
profile: exprs.ExecProfile
|
|
13
|
+
show_pbar: bool
|
|
14
|
+
batch_size: int
|
|
15
|
+
num_rows: int | None
|
|
16
|
+
conn: sql.engine.Connection | None
|
|
17
|
+
pk_clause: list[sql.ClauseElement] | None
|
|
18
|
+
num_computed_exprs: int
|
|
19
|
+
ignore_errors: bool
|
|
20
|
+
random_seed: int # general-purpose source of randomness with execution scope
|
|
21
|
+
|
|
11
22
|
def __init__(
|
|
12
23
|
self,
|
|
13
24
|
row_builder: exprs.RowBuilder,
|
|
14
25
|
*,
|
|
15
26
|
show_pbar: bool = False,
|
|
16
27
|
batch_size: int = 0,
|
|
17
|
-
pk_clause:
|
|
28
|
+
pk_clause: list[sql.ClauseElement] | None = None,
|
|
18
29
|
num_computed_exprs: int = 0,
|
|
19
30
|
ignore_errors: bool = False,
|
|
20
31
|
):
|
|
@@ -23,8 +34,9 @@ class ExecContext:
|
|
|
23
34
|
self.row_builder = row_builder
|
|
24
35
|
self.profile = exprs.ExecProfile(row_builder)
|
|
25
36
|
# num_rows is used to compute the total number of computed cells used for the progress bar
|
|
26
|
-
self.num_rows
|
|
27
|
-
self.conn
|
|
37
|
+
self.num_rows = None
|
|
38
|
+
self.conn = None # if present, use this to execute SQL queries
|
|
28
39
|
self.pk_clause = pk_clause
|
|
29
40
|
self.num_computed_exprs = num_computed_exprs
|
|
30
41
|
self.ignore_errors = ignore_errors
|
|
42
|
+
self.random_seed = random.randint(0, 1 << 63)
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
4
|
import logging
|
|
5
|
-
from typing import AsyncIterator, Iterable, Iterator,
|
|
5
|
+
from typing import AsyncIterator, Iterable, Iterator, TypeVar
|
|
6
6
|
|
|
7
7
|
from pixeltable import exprs
|
|
8
8
|
from pixeltable.env import Env
|
|
@@ -18,16 +18,16 @@ class ExecNode(abc.ABC):
|
|
|
18
18
|
|
|
19
19
|
output_exprs: Iterable[exprs.Expr]
|
|
20
20
|
row_builder: exprs.RowBuilder
|
|
21
|
-
input:
|
|
21
|
+
input: ExecNode | None
|
|
22
22
|
flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
|
|
23
|
-
ctx:
|
|
23
|
+
ctx: ExecContext | None
|
|
24
24
|
|
|
25
25
|
def __init__(
|
|
26
26
|
self,
|
|
27
27
|
row_builder: exprs.RowBuilder,
|
|
28
28
|
output_exprs: Iterable[exprs.Expr],
|
|
29
29
|
input_exprs: Iterable[exprs.Expr],
|
|
30
|
-
input:
|
|
30
|
+
input: ExecNode | None = None,
|
|
31
31
|
):
|
|
32
32
|
assert all(expr.is_valid for expr in output_exprs)
|
|
33
33
|
self.output_exprs = output_exprs
|
|
@@ -85,7 +85,7 @@ class ExecNode(abc.ABC):
|
|
|
85
85
|
|
|
86
86
|
T = TypeVar('T', bound='ExecNode')
|
|
87
87
|
|
|
88
|
-
def get_node(self, node_class: type[T]) ->
|
|
88
|
+
def get_node(self, node_class: type[T]) -> T | None:
|
|
89
89
|
if isinstance(self, node_class):
|
|
90
90
|
return self
|
|
91
91
|
if self.input is not None:
|
|
pixeltable/exec/expr_eval/evaluators.py
CHANGED
|
@@ -5,7 +5,7 @@ import datetime
|
|
|
5
5
|
import itertools
|
|
6
6
|
import logging
|
|
7
7
|
import sys
|
|
8
|
-
from typing import Any, Callable, Iterator,
|
|
8
|
+
from typing import Any, Callable, Iterator, cast
|
|
9
9
|
|
|
10
10
|
from pixeltable import exprs, func
|
|
11
11
|
|
|
@@ -64,11 +64,11 @@ class FnCallEvaluator(Evaluator):
|
|
|
64
64
|
|
|
65
65
|
fn_call: exprs.FunctionCall
|
|
66
66
|
fn: func.CallableFunction
|
|
67
|
-
scalar_py_fn:
|
|
67
|
+
scalar_py_fn: Callable | None # only set for non-batching CallableFunctions
|
|
68
68
|
|
|
69
69
|
# only set if fn.is_batched
|
|
70
|
-
call_args_queue:
|
|
71
|
-
batch_size:
|
|
70
|
+
call_args_queue: asyncio.Queue[FnCallArgs] | None # FnCallArgs waiting for execution
|
|
71
|
+
batch_size: int | None
|
|
72
72
|
|
|
73
73
|
def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
|
|
74
74
|
super().__init__(dispatcher, exec_ctx)
|
|
@@ -160,8 +160,8 @@ class FnCallEvaluator(Evaluator):
|
|
|
160
160
|
|
|
161
161
|
def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
|
|
162
162
|
"""Roll call_args into a single batched FnCallArgs"""
|
|
163
|
-
batch_args: list[list[
|
|
164
|
-
batch_kwargs: dict[str, list[
|
|
163
|
+
batch_args: list[list[Any | None]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
|
|
164
|
+
batch_kwargs: dict[str, list[Any | None]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
|
|
165
165
|
assert isinstance(self.fn, func.CallableFunction)
|
|
166
166
|
for i, item in enumerate(call_args):
|
|
167
167
|
for j in range(len(item.args)):
|
|
pixeltable/exec/expr_eval/expr_eval_node.py
CHANGED
|
@@ -4,7 +4,7 @@ import asyncio
|
|
|
4
4
|
import logging
|
|
5
5
|
import traceback
|
|
6
6
|
from types import TracebackType
|
|
7
|
-
from typing import AsyncIterator, Iterable
|
|
7
|
+
from typing import AsyncIterator, Iterable
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
|
|
@@ -49,17 +49,17 @@ class ExprEvalNode(ExecNode):
|
|
|
49
49
|
# execution state
|
|
50
50
|
tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
|
|
51
51
|
exc_event: asyncio.Event # set if an exception needs to be propagated
|
|
52
|
-
error:
|
|
52
|
+
error: Exception | None # exception that needs to be propagated
|
|
53
53
|
completed_rows: asyncio.Queue[exprs.DataRow] # rows that have completed evaluation
|
|
54
54
|
completed_event: asyncio.Event # set when completed_rows is non-empty
|
|
55
55
|
input_iter: AsyncIterator[DataRowBatch]
|
|
56
|
-
current_input_batch:
|
|
56
|
+
current_input_batch: DataRowBatch | None # batch from which we're currently consuming rows
|
|
57
57
|
input_row_idx: int # next row to consume from current_input_batch
|
|
58
|
-
next_input_batch:
|
|
58
|
+
next_input_batch: DataRowBatch | None # read-ahead input batch
|
|
59
59
|
avail_input_rows: int # total number across both current_/next_input_batch
|
|
60
60
|
input_complete: bool # True if we've received all input batches
|
|
61
61
|
num_in_flight: int # number of dispatched rows that haven't completed
|
|
62
|
-
row_pos_map:
|
|
62
|
+
row_pos_map: dict[int, int] | None # id(row) -> position of row in input; only set if maintain_input_order
|
|
63
63
|
output_buffer: RowBuffer # holds rows that are ready to be returned, in order
|
|
64
64
|
|
|
65
65
|
# debugging
|
|
@@ -217,9 +217,10 @@ class ExprEvalNode(ExecNode):
|
|
|
217
217
|
|
|
218
218
|
row: exprs.DataRow
|
|
219
219
|
exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
|
|
220
|
-
input_batch_aw:
|
|
221
|
-
completed_aw:
|
|
220
|
+
input_batch_aw: asyncio.Task | None = None
|
|
221
|
+
completed_aw: asyncio.Task | None = None
|
|
222
222
|
closed_evaluators = False # True after calling Evaluator.close()
|
|
223
|
+
exprs.Expr.prepare_list(self.exec_ctx.all_exprs)
|
|
223
224
|
|
|
224
225
|
try:
|
|
225
226
|
while True:
|