pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/env.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import datetime
|
|
4
5
|
import glob
|
|
5
6
|
import http.server
|
|
@@ -13,19 +14,23 @@ import shutil
|
|
|
13
14
|
import subprocess
|
|
14
15
|
import sys
|
|
15
16
|
import threading
|
|
16
|
-
import
|
|
17
|
+
import types
|
|
18
|
+
import typing
|
|
17
19
|
import warnings
|
|
18
|
-
from abc import abstractmethod
|
|
19
20
|
from contextlib import contextmanager
|
|
20
21
|
from dataclasses import dataclass, field
|
|
21
22
|
from pathlib import Path
|
|
22
23
|
from sys import stdout
|
|
23
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterator,
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
|
|
24
25
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
25
26
|
|
|
27
|
+
import nest_asyncio # type: ignore[import-untyped]
|
|
26
28
|
import pixeltable_pgserver
|
|
27
29
|
import sqlalchemy as sql
|
|
30
|
+
import tzlocal
|
|
28
31
|
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
32
|
+
from sqlalchemy import orm
|
|
33
|
+
from tenacity import retry, stop_after_attempt, wait_exponential_jitter
|
|
29
34
|
from tqdm import TqdmWarning
|
|
30
35
|
|
|
31
36
|
from pixeltable import exceptions as excs
|
|
@@ -33,6 +38,7 @@ from pixeltable.config import Config
|
|
|
33
38
|
from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
|
|
34
39
|
from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
|
|
35
40
|
from pixeltable.utils.http_server import make_server
|
|
41
|
+
from pixeltable.utils.object_stores import ObjectPath
|
|
36
42
|
|
|
37
43
|
if TYPE_CHECKING:
|
|
38
44
|
import spacy
|
|
@@ -50,42 +56,50 @@ class Env:
|
|
|
50
56
|
For a non-local environment, Pixeltable uses a connection string to the externally managed database.
|
|
51
57
|
"""
|
|
52
58
|
|
|
53
|
-
|
|
59
|
+
SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
|
|
60
|
+
|
|
61
|
+
_instance: Env | None = None
|
|
54
62
|
__initializing: bool = False
|
|
55
63
|
_log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
|
|
56
64
|
|
|
57
|
-
_media_dir:
|
|
58
|
-
_file_cache_dir:
|
|
59
|
-
_dataset_cache_dir:
|
|
60
|
-
_log_dir:
|
|
61
|
-
_tmp_dir:
|
|
62
|
-
_sa_engine:
|
|
63
|
-
_pgdata_dir:
|
|
64
|
-
_db_name:
|
|
65
|
-
_db_server:
|
|
66
|
-
_db_url:
|
|
67
|
-
_default_time_zone:
|
|
65
|
+
_media_dir: Path | None
|
|
66
|
+
_file_cache_dir: Path | None # cached object files with external URL
|
|
67
|
+
_dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
|
|
68
|
+
_log_dir: Path | None # log files
|
|
69
|
+
_tmp_dir: Path | None # any tmp files
|
|
70
|
+
_sa_engine: sql.engine.base.Engine | None
|
|
71
|
+
_pgdata_dir: Path | None
|
|
72
|
+
_db_name: str | None
|
|
73
|
+
_db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
|
|
74
|
+
_db_url: str | None
|
|
75
|
+
_default_time_zone: ZoneInfo | None
|
|
76
|
+
_verbosity: int
|
|
68
77
|
|
|
69
78
|
# info about optional packages that are utilized by some parts of the code
|
|
70
79
|
__optional_packages: dict[str, PackageInfo]
|
|
71
80
|
|
|
72
|
-
_spacy_nlp:
|
|
73
|
-
_httpd:
|
|
74
|
-
_http_address:
|
|
81
|
+
_spacy_nlp: spacy.Language | None
|
|
82
|
+
_httpd: http.server.HTTPServer | None
|
|
83
|
+
_http_address: str | None
|
|
75
84
|
_logger: logging.Logger
|
|
76
85
|
_default_log_level: int
|
|
77
|
-
_logfilename:
|
|
86
|
+
_logfilename: str | None
|
|
78
87
|
_log_to_stdout: bool
|
|
79
88
|
_module_log_level: dict[str, int] # module name -> log level
|
|
80
89
|
_file_cache_size_g: float
|
|
81
|
-
|
|
90
|
+
_default_input_media_dest: str | None
|
|
91
|
+
_default_output_media_dest: str | None
|
|
92
|
+
_pxt_api_key: str | None
|
|
82
93
|
_stdout_handler: logging.StreamHandler
|
|
94
|
+
_default_video_encoder: str | None
|
|
83
95
|
_initialized: bool
|
|
84
96
|
|
|
85
97
|
_resource_pool_info: dict[str, Any]
|
|
86
|
-
_current_conn:
|
|
87
|
-
_current_session:
|
|
88
|
-
|
|
98
|
+
_current_conn: sql.Connection | None
|
|
99
|
+
_current_session: orm.Session | None
|
|
100
|
+
_current_isolation_level: str | None
|
|
101
|
+
_dbms: Dbms | None
|
|
102
|
+
_event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
|
|
89
103
|
|
|
90
104
|
@classmethod
|
|
91
105
|
def get(cls) -> Env:
|
|
@@ -97,17 +111,24 @@ class Env:
|
|
|
97
111
|
def _init_env(cls, reinit_db: bool = False) -> None:
|
|
98
112
|
assert not cls.__initializing, 'Circular env initialization detected.'
|
|
99
113
|
cls.__initializing = True
|
|
114
|
+
if cls._instance is not None:
|
|
115
|
+
cls._instance._clean_up()
|
|
116
|
+
cls._instance = None
|
|
100
117
|
env = Env()
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
118
|
+
try:
|
|
119
|
+
env._set_up(reinit_db=reinit_db)
|
|
120
|
+
env._upgrade_metadata()
|
|
121
|
+
cls._instance = env
|
|
122
|
+
finally:
|
|
123
|
+
# Reset the initializing flag, even if setup fails.
|
|
124
|
+
# This prevents the environment from being left in a broken state.
|
|
125
|
+
cls.__initializing = False
|
|
105
126
|
|
|
106
127
|
def __init__(self) -> None:
|
|
107
128
|
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
108
129
|
|
|
109
130
|
self._media_dir = None # computed media files
|
|
110
|
-
self._file_cache_dir = None # cached
|
|
131
|
+
self._file_cache_dir = None # cached object files with external URL
|
|
111
132
|
self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
|
|
112
133
|
self._log_dir = None # log files
|
|
113
134
|
self._tmp_dir = None # any tmp files
|
|
@@ -121,6 +142,7 @@ class Env:
|
|
|
121
142
|
self._spacy_nlp = None
|
|
122
143
|
self._httpd = None
|
|
123
144
|
self._http_address = None
|
|
145
|
+
self._default_video_encoder = None
|
|
124
146
|
|
|
125
147
|
# logging-related state
|
|
126
148
|
self._logger = logging.getLogger('pixeltable')
|
|
@@ -140,7 +162,34 @@ class Env:
|
|
|
140
162
|
self._resource_pool_info = {}
|
|
141
163
|
self._current_conn = None
|
|
142
164
|
self._current_session = None
|
|
165
|
+
self._current_isolation_level = None
|
|
143
166
|
self._dbms = None
|
|
167
|
+
self._event_loop = None
|
|
168
|
+
|
|
169
|
+
def _init_event_loop(self) -> None:
|
|
170
|
+
try:
|
|
171
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
172
|
+
# multiple run_until_complete()
|
|
173
|
+
running_loop = asyncio.get_running_loop()
|
|
174
|
+
self._event_loop = running_loop
|
|
175
|
+
_logger.debug('Patched running loop')
|
|
176
|
+
except RuntimeError:
|
|
177
|
+
self._event_loop = asyncio.new_event_loop()
|
|
178
|
+
asyncio.set_event_loop(self._event_loop)
|
|
179
|
+
# we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
|
|
180
|
+
self._event_loop.slow_callback_duration = 3600
|
|
181
|
+
|
|
182
|
+
# always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
|
|
183
|
+
# see run_coroutine_synchronously()
|
|
184
|
+
nest_asyncio.apply()
|
|
185
|
+
if _logger.isEnabledFor(logging.DEBUG):
|
|
186
|
+
self._event_loop.set_debug(True)
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def event_loop(self) -> asyncio.AbstractEventLoop:
|
|
190
|
+
if self._event_loop is None:
|
|
191
|
+
self._init_event_loop()
|
|
192
|
+
return self._event_loop
|
|
144
193
|
|
|
145
194
|
@property
|
|
146
195
|
def db_url(self) -> str:
|
|
@@ -153,11 +202,11 @@ class Env:
|
|
|
153
202
|
return self._http_address
|
|
154
203
|
|
|
155
204
|
@property
|
|
156
|
-
def user(self) ->
|
|
205
|
+
def user(self) -> str | None:
|
|
157
206
|
return Config.get().get_string_value('user')
|
|
158
207
|
|
|
159
208
|
@user.setter
|
|
160
|
-
def user(self, user:
|
|
209
|
+
def user(self, user: str | None) -> None:
|
|
161
210
|
if user is None:
|
|
162
211
|
if 'PIXELTABLE_USER' in os.environ:
|
|
163
212
|
del os.environ['PIXELTABLE_USER']
|
|
@@ -165,33 +214,46 @@ class Env:
|
|
|
165
214
|
os.environ['PIXELTABLE_USER'] = user
|
|
166
215
|
|
|
167
216
|
@property
|
|
168
|
-
def default_time_zone(self) ->
|
|
217
|
+
def default_time_zone(self) -> ZoneInfo | None:
|
|
169
218
|
return self._default_time_zone
|
|
170
219
|
|
|
171
220
|
@default_time_zone.setter
|
|
172
|
-
def default_time_zone(self, tz:
|
|
221
|
+
def default_time_zone(self, tz: ZoneInfo | None) -> None:
|
|
173
222
|
"""
|
|
174
223
|
This is not a publicly visible setter; it is only for testing purposes.
|
|
175
224
|
"""
|
|
176
|
-
|
|
225
|
+
if tz is None:
|
|
226
|
+
tz_name = self._get_tz_name()
|
|
227
|
+
else:
|
|
228
|
+
assert isinstance(tz, ZoneInfo)
|
|
229
|
+
tz_name = tz.key
|
|
177
230
|
self.engine.dispose()
|
|
178
231
|
self._create_engine(time_zone_name=tz_name)
|
|
179
232
|
|
|
180
233
|
@property
|
|
181
|
-
def
|
|
234
|
+
def verbosity(self) -> int:
|
|
235
|
+
return self._verbosity
|
|
236
|
+
|
|
237
|
+
@property
|
|
238
|
+
def conn(self) -> sql.Connection | None:
|
|
182
239
|
assert self._current_conn is not None
|
|
183
240
|
return self._current_conn
|
|
184
241
|
|
|
185
242
|
@property
|
|
186
|
-
def session(self) ->
|
|
243
|
+
def session(self) -> orm.Session | None:
|
|
187
244
|
assert self._current_session is not None
|
|
188
245
|
return self._current_session
|
|
189
246
|
|
|
190
247
|
@property
|
|
191
|
-
def dbms(self) ->
|
|
248
|
+
def dbms(self) -> Dbms | None:
|
|
192
249
|
assert self._dbms is not None
|
|
193
250
|
return self._dbms
|
|
194
251
|
|
|
252
|
+
@property
|
|
253
|
+
def is_using_cockroachdb(self) -> bool:
|
|
254
|
+
assert self._dbms is not None
|
|
255
|
+
return isinstance(self._dbms, CockroachDbms)
|
|
256
|
+
|
|
195
257
|
@property
|
|
196
258
|
def in_xact(self) -> bool:
|
|
197
259
|
return self._current_conn is not None
|
|
@@ -202,29 +264,43 @@ class Env:
|
|
|
202
264
|
return self._db_server is not None
|
|
203
265
|
|
|
204
266
|
@contextmanager
|
|
205
|
-
def begin_xact(self) -> Iterator[sql.Connection]:
|
|
206
|
-
"""
|
|
267
|
+
def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
|
|
268
|
+
"""
|
|
269
|
+
Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
|
|
270
|
+
|
|
271
|
+
for_write: if True, uses serializable isolation; if False, uses repeatable_read
|
|
272
|
+
|
|
273
|
+
TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
|
|
274
|
+
that avoids tripping over any pending ops
|
|
275
|
+
"""
|
|
207
276
|
if self._current_conn is None:
|
|
208
277
|
assert self._current_session is None
|
|
209
278
|
try:
|
|
210
|
-
|
|
279
|
+
self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
|
|
280
|
+
with (
|
|
281
|
+
self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
|
|
282
|
+
orm.Session(conn) as session,
|
|
283
|
+
conn.begin(),
|
|
284
|
+
):
|
|
211
285
|
self._current_conn = conn
|
|
212
286
|
self._current_session = session
|
|
213
287
|
yield conn
|
|
214
288
|
finally:
|
|
215
289
|
self._current_session = None
|
|
216
290
|
self._current_conn = None
|
|
291
|
+
self._current_isolation_level = None
|
|
217
292
|
else:
|
|
218
293
|
assert self._current_session is not None
|
|
294
|
+
assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
|
|
219
295
|
yield self._current_conn
|
|
220
296
|
|
|
221
297
|
def configure_logging(
|
|
222
298
|
self,
|
|
223
299
|
*,
|
|
224
|
-
to_stdout:
|
|
225
|
-
level:
|
|
226
|
-
add:
|
|
227
|
-
remove:
|
|
300
|
+
to_stdout: bool | None = None,
|
|
301
|
+
level: int | None = None,
|
|
302
|
+
add: str | None = None,
|
|
303
|
+
remove: str | None = None,
|
|
228
304
|
) -> None:
|
|
229
305
|
"""Configure logging.
|
|
230
306
|
|
|
@@ -266,7 +342,7 @@ class Env:
|
|
|
266
342
|
def set_log_level(self, level: int) -> None:
|
|
267
343
|
self._default_log_level = level
|
|
268
344
|
|
|
269
|
-
def set_module_log_level(self, module: str, level:
|
|
345
|
+
def set_module_log_level(self, module: str, level: int | None) -> None:
|
|
270
346
|
if level is None:
|
|
271
347
|
self._module_log_level.pop(module, None)
|
|
272
348
|
else:
|
|
@@ -281,6 +357,8 @@ class Env:
|
|
|
281
357
|
# accept log messages from a configured pixeltable module (at any level of the module hierarchy)
|
|
282
358
|
path_parts = list(Path(record.pathname).parts)
|
|
283
359
|
path_parts.reverse()
|
|
360
|
+
if 'pixeltable' not in path_parts:
|
|
361
|
+
return False
|
|
284
362
|
max_idx = path_parts.index('pixeltable')
|
|
285
363
|
for module_name in path_parts[:max_idx]:
|
|
286
364
|
if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
|
|
@@ -291,6 +369,26 @@ class Env:
|
|
|
291
369
|
def console_logger(self) -> ConsoleLogger:
|
|
292
370
|
return self._console_logger
|
|
293
371
|
|
|
372
|
+
def _get_tz_name(self) -> str:
|
|
373
|
+
"""Get the time zone name from the configuration, or the system local time zone if not specified.
|
|
374
|
+
|
|
375
|
+
Returns:
|
|
376
|
+
str: The time zone name.
|
|
377
|
+
"""
|
|
378
|
+
tz_name = Config.get().get_string_value('time_zone')
|
|
379
|
+
if tz_name is not None:
|
|
380
|
+
# Validate tzname
|
|
381
|
+
if not isinstance(tz_name, str):
|
|
382
|
+
self._logger.error('Invalid time zone specified in configuration.')
|
|
383
|
+
else:
|
|
384
|
+
try:
|
|
385
|
+
_ = ZoneInfo(tz_name)
|
|
386
|
+
except ZoneInfoNotFoundError:
|
|
387
|
+
self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
|
|
388
|
+
else:
|
|
389
|
+
tz_name = tzlocal.get_localzone_name()
|
|
390
|
+
return tz_name
|
|
391
|
+
|
|
294
392
|
def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
|
|
295
393
|
if self._initialized:
|
|
296
394
|
return
|
|
@@ -300,22 +398,18 @@ class Env:
|
|
|
300
398
|
config = Config.get()
|
|
301
399
|
|
|
302
400
|
self._initialized = True
|
|
401
|
+
|
|
303
402
|
self._media_dir = Config.get().home / 'media'
|
|
304
403
|
self._file_cache_dir = Config.get().home / 'file_cache'
|
|
305
404
|
self._dataset_cache_dir = Config.get().home / 'dataset_cache'
|
|
306
405
|
self._log_dir = Config.get().home / 'logs'
|
|
307
406
|
self._tmp_dir = Config.get().home / 'tmp'
|
|
308
407
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
self._dataset_cache_dir.mkdir()
|
|
315
|
-
if not self._log_dir.exists():
|
|
316
|
-
self._log_dir.mkdir()
|
|
317
|
-
if not self._tmp_dir.exists():
|
|
318
|
-
self._tmp_dir.mkdir()
|
|
408
|
+
self._media_dir.mkdir(exist_ok=True)
|
|
409
|
+
self._file_cache_dir.mkdir(exist_ok=True)
|
|
410
|
+
self._dataset_cache_dir.mkdir(exist_ok=True)
|
|
411
|
+
self._log_dir.mkdir(exist_ok=True)
|
|
412
|
+
self._tmp_dir.mkdir(exist_ok=True)
|
|
319
413
|
|
|
320
414
|
self._file_cache_size_g = config.get_float_value('file_cache_size_g')
|
|
321
415
|
if self._file_cache_size_g is None:
|
|
@@ -324,6 +418,16 @@ class Env:
|
|
|
324
418
|
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
|
|
325
419
|
'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
|
|
326
420
|
)
|
|
421
|
+
|
|
422
|
+
self._default_input_media_dest = config.get_string_value('input_media_dest')
|
|
423
|
+
self._default_output_media_dest = config.get_string_value('output_media_dest')
|
|
424
|
+
for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
|
|
425
|
+
if uri is not None:
|
|
426
|
+
try:
|
|
427
|
+
_ = ObjectPath.parse_object_storage_addr(uri, False)
|
|
428
|
+
except Exception as e:
|
|
429
|
+
raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
|
|
430
|
+
|
|
327
431
|
self._pxt_api_key = config.get_string_value('api_key')
|
|
328
432
|
|
|
329
433
|
# Disable spurious warnings
|
|
@@ -333,10 +437,12 @@ class Env:
|
|
|
333
437
|
warnings.simplefilter('ignore', category=UserWarning)
|
|
334
438
|
warnings.simplefilter('ignore', category=FutureWarning)
|
|
335
439
|
|
|
336
|
-
# Set
|
|
337
|
-
|
|
440
|
+
# Set verbosity level for user visible console messages
|
|
441
|
+
self._verbosity = config.get_int_value('verbosity')
|
|
442
|
+
if self._verbosity is None:
|
|
443
|
+
self._verbosity = 1
|
|
338
444
|
stdout_handler = ConsoleOutputHandler(stream=stdout)
|
|
339
|
-
stdout_handler.setLevel(
|
|
445
|
+
stdout_handler.setLevel(map_level(self._verbosity))
|
|
340
446
|
stdout_handler.addFilter(ConsoleMessageFilter())
|
|
341
447
|
self._logger.addHandler(stdout_handler)
|
|
342
448
|
self._console_logger = ConsoleLogger(self._logger)
|
|
@@ -370,6 +476,7 @@ class Env:
|
|
|
370
476
|
http_logger.propagate = False
|
|
371
477
|
|
|
372
478
|
self.clear_tmp_dir()
|
|
479
|
+
tz_name = self._get_tz_name()
|
|
373
480
|
|
|
374
481
|
# configure pixeltable database
|
|
375
482
|
self._init_db(config)
|
|
@@ -379,22 +486,10 @@ class Env:
|
|
|
379
486
|
'Reinitializing pixeltable database is not supported when running in non-local environment'
|
|
380
487
|
)
|
|
381
488
|
|
|
382
|
-
tz_name = config.get_string_value('time_zone')
|
|
383
|
-
if tz_name is not None:
|
|
384
|
-
# Validate tzname
|
|
385
|
-
if not isinstance(tz_name, str):
|
|
386
|
-
self._logger.error('Invalid time zone specified in configuration.')
|
|
387
|
-
else:
|
|
388
|
-
try:
|
|
389
|
-
_ = ZoneInfo(tz_name)
|
|
390
|
-
except ZoneInfoNotFoundError:
|
|
391
|
-
self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
|
|
392
|
-
|
|
393
489
|
if reinit_db and self._store_db_exists():
|
|
394
490
|
self._drop_store_db()
|
|
395
491
|
|
|
396
492
|
create_db = not self._store_db_exists()
|
|
397
|
-
|
|
398
493
|
if create_db:
|
|
399
494
|
self._logger.info(f'creating database at: {self.db_url}')
|
|
400
495
|
self._create_store_db()
|
|
@@ -439,7 +534,7 @@ class Env:
|
|
|
439
534
|
raise excs.Error(error)
|
|
440
535
|
self._logger.info(f'Using database at: {self.db_url}')
|
|
441
536
|
else:
|
|
442
|
-
self._db_name =
|
|
537
|
+
self._db_name = config.get_string_value('db') or 'pixeltable'
|
|
443
538
|
self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
|
|
444
539
|
# cleanup_mode=None will leave the postgres process running after Python exits
|
|
445
540
|
# cleanup_mode='stop' will terminate the postgres process when Python exits
|
|
@@ -453,30 +548,49 @@ class Env:
|
|
|
453
548
|
assert self._db_url is not None
|
|
454
549
|
assert self._db_name is not None
|
|
455
550
|
|
|
551
|
+
@retry(
|
|
552
|
+
stop=stop_after_attempt(3), # Stop after 3 attempts
|
|
553
|
+
wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
|
|
554
|
+
)
|
|
456
555
|
def _init_metadata(self) -> None:
|
|
457
556
|
"""
|
|
458
557
|
Create pixeltable metadata tables and system metadata.
|
|
459
558
|
This is an idempotent operation.
|
|
559
|
+
|
|
560
|
+
Retry logic handles race conditions when multiple Pixeltable processes
|
|
561
|
+
attempt to initialize metadata tables simultaneously. The first process may succeed
|
|
562
|
+
in creating tables while others encounter database constraints (e.g., "table already exists").
|
|
563
|
+
Exponential backoff with jitter reduces contention between competing processes.
|
|
460
564
|
"""
|
|
461
565
|
assert self._sa_engine is not None
|
|
462
566
|
from pixeltable import metadata
|
|
463
567
|
|
|
568
|
+
self._logger.debug('Creating pixeltable metadata')
|
|
464
569
|
metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
|
|
465
570
|
metadata.create_system_info(self._sa_engine)
|
|
466
571
|
|
|
467
|
-
def _create_engine(self, time_zone_name:
|
|
468
|
-
connect_args = {
|
|
572
|
+
def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
|
|
573
|
+
connect_args = {'options': f'-c timezone={time_zone_name}'}
|
|
574
|
+
self._logger.info(f'Creating SQLAlchemy engine with connection arguments: {connect_args}')
|
|
469
575
|
self._sa_engine = sql.create_engine(
|
|
470
576
|
self.db_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level, connect_args=connect_args
|
|
471
577
|
)
|
|
472
578
|
|
|
473
579
|
self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
|
|
580
|
+
self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
|
|
581
|
+
self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
|
|
474
582
|
|
|
475
583
|
with self.engine.begin() as conn:
|
|
476
584
|
tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
|
|
477
585
|
assert isinstance(tz_name, str)
|
|
478
586
|
self._logger.info(f'Database time zone is now: {tz_name}')
|
|
479
587
|
self._default_time_zone = ZoneInfo(tz_name)
|
|
588
|
+
if self.is_using_cockroachdb:
|
|
589
|
+
# This could be set when the database is created, but we set it now
|
|
590
|
+
conn.execute(sql.text('SET null_ordered_last = true;'))
|
|
591
|
+
null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
|
|
592
|
+
assert isinstance(null_ordered_last, str)
|
|
593
|
+
self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
|
|
480
594
|
|
|
481
595
|
def _store_db_exists(self) -> bool:
|
|
482
596
|
assert self._db_name is not None
|
|
@@ -511,6 +625,14 @@ class Env:
|
|
|
511
625
|
finally:
|
|
512
626
|
engine.dispose()
|
|
513
627
|
|
|
628
|
+
def _pgserver_terminate_connections_stmt(self) -> str:
|
|
629
|
+
return f"""
|
|
630
|
+
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
631
|
+
FROM pg_stat_activity
|
|
632
|
+
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
633
|
+
AND pid <> pg_backend_pid()
|
|
634
|
+
"""
|
|
635
|
+
|
|
514
636
|
def _drop_store_db(self) -> None:
|
|
515
637
|
assert self._db_name is not None
|
|
516
638
|
engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
|
|
@@ -519,13 +641,7 @@ class Env:
|
|
|
519
641
|
with engine.begin() as conn:
|
|
520
642
|
# terminate active connections
|
|
521
643
|
if self._db_server is not None:
|
|
522
|
-
|
|
523
|
-
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
524
|
-
FROM pg_stat_activity
|
|
525
|
-
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
526
|
-
AND pid <> pg_backend_pid()
|
|
527
|
-
"""
|
|
528
|
-
conn.execute(sql.text(stmt))
|
|
644
|
+
conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
|
|
529
645
|
# drop db
|
|
530
646
|
stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
|
|
531
647
|
conn.execute(sql.text(stmt))
|
|
@@ -538,12 +654,7 @@ class Env:
|
|
|
538
654
|
metadata.upgrade_md(self._sa_engine)
|
|
539
655
|
|
|
540
656
|
@property
|
|
541
|
-
def pxt_api_key(self) -> str:
|
|
542
|
-
if self._pxt_api_key is None:
|
|
543
|
-
raise excs.Error(
|
|
544
|
-
'No API key is configured. Set the PIXELTABLE_API_KEY environment variable, or add an entry to '
|
|
545
|
-
'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
|
|
546
|
-
)
|
|
657
|
+
def pxt_api_key(self) -> str | None:
|
|
547
658
|
return self._pxt_api_key
|
|
548
659
|
|
|
549
660
|
def get_client(self, name: str) -> Any:
|
|
@@ -553,35 +664,51 @@ class Env:
|
|
|
553
664
|
Args:
|
|
554
665
|
- name: The name of the client
|
|
555
666
|
"""
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
667
|
+
# Return the existing client if it has already been constructed
|
|
668
|
+
with _registered_clients_lock:
|
|
669
|
+
cl = _registered_clients[name]
|
|
670
|
+
if cl.client_obj is not None:
|
|
671
|
+
return cl.client_obj # Already initialized
|
|
672
|
+
|
|
673
|
+
# Retrieve parameters required to construct the requested client.
|
|
674
|
+
init_kwargs: dict[str, Any] = {}
|
|
675
|
+
for param in cl.params.values():
|
|
676
|
+
# Determine the type of the parameter for proper config parsing.
|
|
677
|
+
pname = param.name
|
|
678
|
+
t = param.annotation
|
|
679
|
+
# Deference T | None
|
|
680
|
+
if typing.get_origin(t) in (typing.Union, types.UnionType):
|
|
681
|
+
args = typing.get_args(t)
|
|
682
|
+
if args[0] is type(None):
|
|
683
|
+
t = args[1]
|
|
684
|
+
elif args[1] is type(None):
|
|
685
|
+
t = args[0]
|
|
686
|
+
assert isinstance(t, type), t
|
|
687
|
+
arg: Any = Config.get().get_value(pname, t, section=name)
|
|
688
|
+
if arg is not None:
|
|
689
|
+
init_kwargs[pname] = arg
|
|
690
|
+
elif param.default is inspect.Parameter.empty:
|
|
568
691
|
raise excs.Error(
|
|
569
|
-
f'`{name}` client not initialized: parameter `{
|
|
570
|
-
f'To fix this, specify the `{name.upper()}_{
|
|
571
|
-
f'or put `{
|
|
692
|
+
f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
|
|
693
|
+
f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
|
|
694
|
+
f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
572
695
|
)
|
|
573
696
|
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
697
|
+
# Construct the requested client
|
|
698
|
+
with _registered_clients_lock:
|
|
699
|
+
if cl.client_obj is not None:
|
|
700
|
+
return cl.client_obj # Already initialized
|
|
701
|
+
cl.client_obj = cl.init_fn(**init_kwargs)
|
|
702
|
+
self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
|
|
703
|
+
return cl.client_obj
|
|
577
704
|
|
|
578
705
|
def _start_web_server(self) -> None:
|
|
579
706
|
"""
|
|
580
707
|
The http server root is the file system root.
|
|
581
708
|
eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
|
|
582
|
-
|
|
583
|
-
This arrangement enables serving
|
|
584
|
-
as well as external
|
|
709
|
+
On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
|
|
710
|
+
This arrangement enables serving objects hosted within _home,
|
|
711
|
+
as well as external objects inserted into pixeltable or produced by pixeltable.
|
|
585
712
|
The port is chosen dynamically to prevent conflicts.
|
|
586
713
|
"""
|
|
587
714
|
# Port 0 means OS picks one for us.
|
|
@@ -603,17 +730,60 @@ class Env:
|
|
|
603
730
|
self._start_web_server()
|
|
604
731
|
self.__register_packages()
|
|
605
732
|
|
|
733
|
+
@property
|
|
734
|
+
def default_video_encoder(self) -> str | None:
|
|
735
|
+
if self._default_video_encoder is None:
|
|
736
|
+
self._default_video_encoder = self._determine_default_video_encoder()
|
|
737
|
+
return self._default_video_encoder
|
|
738
|
+
|
|
739
|
+
def _determine_default_video_encoder(self) -> str | None:
|
|
740
|
+
"""
|
|
741
|
+
Returns the first available encoder from a list of candidates.
|
|
742
|
+
|
|
743
|
+
TODO:
|
|
744
|
+
- the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
|
|
745
|
+
- allow user override via a config option 'video_encoder'
|
|
746
|
+
"""
|
|
747
|
+
# look for available encoders, in this order
|
|
748
|
+
candidates = [
|
|
749
|
+
'libx264', # GPL, best quality
|
|
750
|
+
'libopenh264', # BSD
|
|
751
|
+
]
|
|
752
|
+
|
|
753
|
+
try:
|
|
754
|
+
# Get list of available encoders
|
|
755
|
+
result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
|
|
756
|
+
|
|
757
|
+
if result.returncode == 0:
|
|
758
|
+
available_encoders = result.stdout
|
|
759
|
+
for encoder in candidates:
|
|
760
|
+
# ffmpeg -encoders output format: " V..... encoder_name description"
|
|
761
|
+
if f' {encoder} ' in available_encoders:
|
|
762
|
+
_logger.debug(f'Using H.264 encoder: {encoder}')
|
|
763
|
+
return encoder
|
|
764
|
+
except Exception:
|
|
765
|
+
pass
|
|
766
|
+
return None
|
|
767
|
+
|
|
606
768
|
def __register_packages(self) -> None:
|
|
607
769
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
770
|
+
self.__register_package('accelerate')
|
|
608
771
|
self.__register_package('anthropic')
|
|
772
|
+
self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
|
|
609
773
|
self.__register_package('boto3')
|
|
610
774
|
self.__register_package('datasets')
|
|
775
|
+
self.__register_package('diffusers')
|
|
611
776
|
self.__register_package('fiftyone')
|
|
777
|
+
self.__register_package('twelvelabs')
|
|
612
778
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
779
|
+
self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
|
|
613
780
|
self.__register_package('google.genai', library_name='google-genai')
|
|
781
|
+
self.__register_package('groq')
|
|
614
782
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
615
783
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
784
|
+
self.__register_package('librosa')
|
|
616
785
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
786
|
+
self.__register_package('mcp')
|
|
617
787
|
self.__register_package('mistralai')
|
|
618
788
|
self.__register_package('mistune')
|
|
619
789
|
self.__register_package('ollama')
|
|
@@ -622,8 +792,10 @@ class Env:
|
|
|
622
792
|
self.__register_package('pyarrow')
|
|
623
793
|
self.__register_package('pydantic')
|
|
624
794
|
self.__register_package('replicate')
|
|
795
|
+
self.__register_package('reve')
|
|
625
796
|
self.__register_package('sentencepiece')
|
|
626
797
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
798
|
+
self.__register_package('soundfile')
|
|
627
799
|
self.__register_package('spacy')
|
|
628
800
|
self.__register_package('tiktoken')
|
|
629
801
|
self.__register_package('together')
|
|
@@ -634,8 +806,10 @@ class Env:
|
|
|
634
806
|
self.__register_package('whisper', library_name='openai-whisper')
|
|
635
807
|
self.__register_package('whisperx')
|
|
636
808
|
self.__register_package('yolox', library_name='pixeltable-yolox')
|
|
809
|
+
self.__register_package('lancedb')
|
|
810
|
+
self.__register_package('scenedetect')
|
|
637
811
|
|
|
638
|
-
def __register_package(self, package_name: str, library_name:
|
|
812
|
+
def __register_package(self, package_name: str, library_name: str | None = None) -> None:
|
|
639
813
|
is_installed: bool
|
|
640
814
|
try:
|
|
641
815
|
is_installed = importlib.util.find_spec(package_name) is not None
|
|
@@ -647,7 +821,11 @@ class Env:
|
|
|
647
821
|
library_name=library_name or package_name, # defaults to package_name unless specified otherwise
|
|
648
822
|
)
|
|
649
823
|
|
|
650
|
-
def
|
|
824
|
+
def require_binary(self, binary_name: str) -> None:
|
|
825
|
+
if not shutil.which(binary_name):
|
|
826
|
+
raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
|
|
827
|
+
|
|
828
|
+
def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
|
|
651
829
|
"""
|
|
652
830
|
Checks whether the specified optional package is available. If not, raises an exception
|
|
653
831
|
with an error message informing the user how to install it.
|
|
@@ -691,14 +869,8 @@ class Env:
|
|
|
691
869
|
else:
|
|
692
870
|
os.remove(path)
|
|
693
871
|
|
|
694
|
-
def
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
def create_tmp_path(self, extension: str = '') -> Path:
|
|
698
|
-
return self._tmp_dir / f'{uuid.uuid4()}{extension}'
|
|
699
|
-
|
|
700
|
-
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
|
|
701
|
-
def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
|
|
872
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
|
|
873
|
+
def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
|
|
702
874
|
"""Returns the info object for the given id, creating it if necessary."""
|
|
703
875
|
info = self._resource_pool_info.get(pool_id)
|
|
704
876
|
if info is None and make_pool_info is not None:
|
|
@@ -711,6 +883,14 @@ class Env:
|
|
|
711
883
|
assert self._media_dir is not None
|
|
712
884
|
return self._media_dir
|
|
713
885
|
|
|
886
|
+
@property
|
|
887
|
+
def default_input_media_dest(self) -> str | None:
|
|
888
|
+
return self._default_input_media_dest
|
|
889
|
+
|
|
890
|
+
@property
|
|
891
|
+
def default_output_media_dest(self) -> str | None:
|
|
892
|
+
return self._default_output_media_dest
|
|
893
|
+
|
|
714
894
|
@property
|
|
715
895
|
def file_cache_dir(self) -> Path:
|
|
716
896
|
assert self._file_cache_dir is not None
|
|
@@ -746,24 +926,74 @@ class Env:
|
|
|
746
926
|
have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
|
|
747
927
|
"""
|
|
748
928
|
import spacy
|
|
749
|
-
from spacy.cli.download import
|
|
929
|
+
from spacy.cli.download import download
|
|
750
930
|
|
|
751
931
|
spacy_model = 'en_core_web_sm'
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
url = f'{spacy.about.__download_url__}/{filename}'
|
|
755
|
-
# Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
|
|
756
|
-
# a problem, because the model might have been installed on a previous attempt.
|
|
757
|
-
self._logger.info(f'Ensuring spaCy model is installed: {filename}')
|
|
758
|
-
ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
|
|
759
|
-
if ret.returncode != 0:
|
|
760
|
-
self._logger.warning(f'pip install failed for spaCy model: {filename}')
|
|
932
|
+
self._logger.info(f'Ensuring spaCy model is installed: {spacy_model}')
|
|
933
|
+
download(spacy_model)
|
|
761
934
|
self._logger.info(f'Loading spaCy model: {spacy_model}')
|
|
762
935
|
try:
|
|
763
936
|
self._spacy_nlp = spacy.load(spacy_model)
|
|
764
937
|
except Exception as exc:
|
|
765
938
|
raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
|
|
766
939
|
|
|
940
|
+
def _clean_up(self) -> None:
|
|
941
|
+
"""
|
|
942
|
+
Internal cleanup method that properly closes all resources and resets state.
|
|
943
|
+
This is called before destroying the singleton instance.
|
|
944
|
+
"""
|
|
945
|
+
assert self._current_session is None
|
|
946
|
+
assert self._current_conn is None
|
|
947
|
+
|
|
948
|
+
# Stop HTTP server
|
|
949
|
+
if self._httpd is not None:
|
|
950
|
+
try:
|
|
951
|
+
self._httpd.shutdown()
|
|
952
|
+
self._httpd.server_close()
|
|
953
|
+
except Exception as e:
|
|
954
|
+
_logger.warning(f'Error stopping HTTP server: {e}')
|
|
955
|
+
|
|
956
|
+
# First terminate all connections to the database
|
|
957
|
+
if self._db_server is not None:
|
|
958
|
+
assert self._dbms is not None
|
|
959
|
+
assert self._db_name is not None
|
|
960
|
+
try:
|
|
961
|
+
temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
|
|
962
|
+
try:
|
|
963
|
+
with temp_engine.begin() as conn:
|
|
964
|
+
conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
|
|
965
|
+
_logger.info(f"Terminated all connections to database '{self._db_name}'")
|
|
966
|
+
except Exception as e:
|
|
967
|
+
_logger.warning(f'Error terminating database connections: {e}')
|
|
968
|
+
finally:
|
|
969
|
+
temp_engine.dispose()
|
|
970
|
+
except Exception as e:
|
|
971
|
+
_logger.warning(f'Error stopping database server: {e}')
|
|
972
|
+
|
|
973
|
+
# Dispose of SQLAlchemy engine (after stopping db server)
|
|
974
|
+
if self._sa_engine is not None:
|
|
975
|
+
try:
|
|
976
|
+
self._sa_engine.dispose()
|
|
977
|
+
except Exception as e:
|
|
978
|
+
_logger.warning(f'Error disposing engine: {e}')
|
|
979
|
+
|
|
980
|
+
# Close event loop
|
|
981
|
+
if self._event_loop is not None:
|
|
982
|
+
try:
|
|
983
|
+
if self._event_loop.is_running():
|
|
984
|
+
self._event_loop.stop()
|
|
985
|
+
self._event_loop.close()
|
|
986
|
+
except Exception as e:
|
|
987
|
+
_logger.warning(f'Error closing event loop: {e}')
|
|
988
|
+
|
|
989
|
+
# Remove logging handlers
|
|
990
|
+
for handler in self._logger.handlers[:]:
|
|
991
|
+
try:
|
|
992
|
+
handler.close()
|
|
993
|
+
self._logger.removeHandler(handler)
|
|
994
|
+
except Exception as e:
|
|
995
|
+
_logger.warning(f'Error removing handler: {e}')
|
|
996
|
+
|
|
767
997
|
|
|
768
998
|
def register_client(name: str) -> Callable:
|
|
769
999
|
"""Decorator that registers a third-party API client for use by Pixeltable.
|
|
@@ -792,27 +1022,29 @@ def register_client(name: str) -> Callable:
|
|
|
792
1022
|
|
|
793
1023
|
def decorator(fn: Callable) -> None:
|
|
794
1024
|
sig = inspect.signature(fn)
|
|
795
|
-
|
|
796
|
-
|
|
1025
|
+
params = dict(sig.parameters)
|
|
1026
|
+
with _registered_clients_lock:
|
|
1027
|
+
_registered_clients[name] = ApiClient(init_fn=fn, params=params)
|
|
797
1028
|
|
|
798
1029
|
return decorator
|
|
799
1030
|
|
|
800
1031
|
|
|
1032
|
+
_registered_clients_lock: threading.Lock = threading.Lock()
|
|
801
1033
|
_registered_clients: dict[str, ApiClient] = {}
|
|
802
1034
|
|
|
803
1035
|
|
|
804
1036
|
@dataclass
|
|
805
1037
|
class ApiClient:
|
|
806
1038
|
init_fn: Callable
|
|
807
|
-
|
|
808
|
-
client_obj:
|
|
1039
|
+
params: dict[str, inspect.Parameter]
|
|
1040
|
+
client_obj: Any | None = None
|
|
809
1041
|
|
|
810
1042
|
|
|
811
1043
|
@dataclass
|
|
812
1044
|
class PackageInfo:
|
|
813
1045
|
is_installed: bool
|
|
814
1046
|
library_name: str # pypi library name (may be different from package name)
|
|
815
|
-
version:
|
|
1047
|
+
version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
|
|
816
1048
|
|
|
817
1049
|
|
|
818
1050
|
TIME_FORMAT = '%H:%M.%S %f'
|
|
@@ -838,6 +1070,10 @@ class RateLimitsInfo:
|
|
|
838
1070
|
get_request_resources: Callable[..., dict[str, int]]
|
|
839
1071
|
|
|
840
1072
|
resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
|
|
1073
|
+
has_exc: bool = False
|
|
1074
|
+
|
|
1075
|
+
def debug_str(self) -> str:
|
|
1076
|
+
return ','.join(info.debug_str() for info in self.resource_limits.values())
|
|
841
1077
|
|
|
842
1078
|
def is_initialized(self) -> bool:
|
|
843
1079
|
return len(self.resource_limits) > 0
|
|
@@ -845,7 +1081,7 @@ class RateLimitsInfo:
|
|
|
845
1081
|
def reset(self) -> None:
|
|
846
1082
|
self.resource_limits.clear()
|
|
847
1083
|
|
|
848
|
-
def record(self, **kwargs: Any) -> None:
|
|
1084
|
+
def record(self, reset_exc: bool = False, **kwargs: Any) -> None:
|
|
849
1085
|
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
|
850
1086
|
if len(self.resource_limits) == 0:
|
|
851
1087
|
self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
|
|
@@ -856,14 +1092,30 @@ class RateLimitsInfo:
|
|
|
856
1092
|
f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
|
|
857
1093
|
)
|
|
858
1094
|
else:
|
|
1095
|
+
if self.has_exc and not reset_exc:
|
|
1096
|
+
# ignore updates until we're asked to reset
|
|
1097
|
+
_logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
|
|
1098
|
+
return
|
|
1099
|
+
self.has_exc = False
|
|
859
1100
|
for k, v in kwargs.items():
|
|
860
1101
|
if v is not None:
|
|
861
1102
|
self.resource_limits[k].update(now, *v)
|
|
862
1103
|
|
|
863
|
-
|
|
864
|
-
|
|
1104
|
+
def record_exc(self, exc: Exception) -> None:
|
|
1105
|
+
"""Update self.resource_limits based on the exception headers"""
|
|
1106
|
+
self.has_exc = True
|
|
1107
|
+
|
|
1108
|
+
def get_retry_delay(self, exc: Exception) -> float | None:
|
|
865
1109
|
"""Returns number of seconds to wait before retry, or None if not retryable"""
|
|
866
|
-
|
|
1110
|
+
if len(self.resource_limits) == 0:
|
|
1111
|
+
return 1.0
|
|
1112
|
+
# we're looking for the maximum delay across all depleted resources
|
|
1113
|
+
max_delay = 0.0
|
|
1114
|
+
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
|
1115
|
+
for limit_info in self.resource_limits.values():
|
|
1116
|
+
if limit_info.remaining < 0.05 * limit_info.limit:
|
|
1117
|
+
max_delay = max(max_delay, (limit_info.reset_at - now).total_seconds())
|
|
1118
|
+
return max_delay if max_delay > 0 else None
|
|
867
1119
|
|
|
868
1120
|
|
|
869
1121
|
@dataclass
|
|
@@ -876,9 +1128,15 @@ class RateLimitInfo:
|
|
|
876
1128
|
remaining: int
|
|
877
1129
|
reset_at: datetime.datetime
|
|
878
1130
|
|
|
1131
|
+
def debug_str(self) -> str:
|
|
1132
|
+
return (
|
|
1133
|
+
f'{self.resource}@{self.recorded_at.strftime(TIME_FORMAT)}: '
|
|
1134
|
+
f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
|
|
1135
|
+
)
|
|
1136
|
+
|
|
879
1137
|
def update(self, recorded_at: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime) -> None:
|
|
880
1138
|
# we always update everything, even though responses may come back out-of-order: we can't use reset_at to
|
|
881
|
-
# determine order, because it doesn't increase monotonically (the
|
|
1139
|
+
# determine order, because it doesn't increase monotonically (the reset duration shortens as output_tokens
|
|
882
1140
|
# are freed up - going from max to actual)
|
|
883
1141
|
self.recorded_at = recorded_at
|
|
884
1142
|
self.limit = limit
|
|
@@ -890,3 +1148,16 @@ class RateLimitInfo:
|
|
|
890
1148
|
f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
|
|
891
1149
|
f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
|
|
892
1150
|
)
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
@dataclass
|
|
1154
|
+
class RuntimeCtx:
|
|
1155
|
+
"""
|
|
1156
|
+
Container for runtime data provided by the execution system to udfs.
|
|
1157
|
+
|
|
1158
|
+
Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
|
|
1159
|
+
"""
|
|
1160
|
+
|
|
1161
|
+
# Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
|
|
1162
|
+
# If True, call RateLimitsInfo.record() with reset_exc=True.
|
|
1163
|
+
is_retry: bool = False
|