pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/env.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import datetime
 import glob
 import http.server
@@ -7,24 +8,30 @@ import importlib
 import importlib.util
 import inspect
 import logging
+import math
 import os
 import platform
 import shutil
 import subprocess
 import sys
 import threading
-import
+import types
+import typing
 import warnings
-from abc import abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from sys import stdout
-from typing import TYPE_CHECKING, Any, Callable, Iterator,
+from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
+import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
+import tzlocal
+from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from sqlalchemy import orm
+from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning
 
 from pixeltable import exceptions as excs
@@ -32,6 +39,8 @@ from pixeltable.config import Config
 from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
 from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
 from pixeltable.utils.http_server import make_server
+from pixeltable.utils.object_stores import ObjectPath
+from pixeltable.utils.sql import add_option_to_db_url
 
 if TYPE_CHECKING:
     import spacy
@@ -49,42 +58,50 @@ class Env:
     For a non-local environment, Pixeltable uses a connection string to the externally managed database.
     """
 
-
+    SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
+
+    _instance: Env | None = None
     __initializing: bool = False
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
 
-    _media_dir:
-    _file_cache_dir:
-    _dataset_cache_dir:
-    _log_dir:
-    _tmp_dir:
-    _sa_engine:
-    _pgdata_dir:
-    _db_name:
-    _db_server:
-    _db_url:
-    _default_time_zone:
+    _media_dir: Path | None
+    _file_cache_dir: Path | None  # cached object files with external URL
+    _dataset_cache_dir: Path | None  # cached datasets (eg, pytorch or COCO)
+    _log_dir: Path | None  # log files
+    _tmp_dir: Path | None  # any tmp files
+    _sa_engine: sql.engine.base.Engine | None
+    _pgdata_dir: Path | None
+    _db_name: str | None
+    _db_server: pixeltable_pgserver.PostgresServer | None  # set only when running in local environment
+    _db_url: str | None
+    _default_time_zone: ZoneInfo | None
+    _verbosity: int
 
     # info about optional packages that are utilized by some parts of the code
     __optional_packages: dict[str, PackageInfo]
 
-    _spacy_nlp:
-    _httpd:
-    _http_address:
+    _spacy_nlp: spacy.Language | None
+    _httpd: http.server.HTTPServer | None
+    _http_address: str | None
     _logger: logging.Logger
     _default_log_level: int
-    _logfilename:
+    _logfilename: str | None
     _log_to_stdout: bool
     _module_log_level: dict[str, int]  # module name -> log level
     _file_cache_size_g: float
-
+    _default_input_media_dest: str | None
+    _default_output_media_dest: str | None
+    _pxt_api_key: str | None
     _stdout_handler: logging.StreamHandler
+    _default_video_encoder: str | None
     _initialized: bool
 
     _resource_pool_info: dict[str, Any]
-    _current_conn:
-    _current_session:
-
+    _current_conn: sql.Connection | None
+    _current_session: orm.Session | None
+    _current_isolation_level: str | None
+    _dbms: Dbms | None
+    _event_loop: asyncio.AbstractEventLoop | None  # event loop for ExecNode
 
     @classmethod
     def get(cls) -> Env:
@@ -96,17 +113,24 @@ class Env:
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        if cls._instance is not None:
+            cls._instance._clean_up()
+            cls._instance = None
         env = Env()
-
-
-
-
+        try:
+            env._set_up(reinit_db=reinit_db)
+            env._upgrade_metadata()
+            cls._instance = env
+        finally:
+            # Reset the initializing flag, even if setup fails.
+            # This prevents the environment from being left in a broken state.
+            cls.__initializing = False
 
     def __init__(self) -> None:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
 
         self._media_dir = None  # computed media files
-        self._file_cache_dir = None  # cached
+        self._file_cache_dir = None  # cached object files with external URL
         self._dataset_cache_dir = None  # cached datasets (eg, pytorch or COCO)
         self._log_dir = None  # log files
         self._tmp_dir = None  # any tmp files
@@ -120,6 +144,7 @@ class Env:
         self._spacy_nlp = None
         self._httpd = None
         self._http_address = None
+        self._default_video_encoder = None
 
         # logging-related state
         self._logger = logging.getLogger('pixeltable')
@@ -139,7 +164,34 @@ class Env:
         self._resource_pool_info = {}
         self._current_conn = None
         self._current_session = None
+        self._current_isolation_level = None
         self._dbms = None
+        self._event_loop = None
+
+    def _init_event_loop(self) -> None:
+        try:
+            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+            # multiple run_until_complete()
+            running_loop = asyncio.get_running_loop()
+            self._event_loop = running_loop
+            _logger.debug('Patched running loop')
+        except RuntimeError:
+            self._event_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._event_loop)
+        # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
+        self._event_loop.slow_callback_duration = 3600
+
+        # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
+        # see run_coroutine_synchronously()
+        nest_asyncio.apply()
+        if _logger.isEnabledFor(logging.DEBUG):
+            self._event_loop.set_debug(True)
+
+    @property
+    def event_loop(self) -> asyncio.AbstractEventLoop:
+        if self._event_loop is None:
+            self._init_event_loop()
+        return self._event_loop
 
     @property
     def db_url(self) -> str:
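Note: the `_init_event_loop()` logic added above follows a common pattern for libraries that need to run coroutines synchronously even when the host process (e.g. Jupyter) already owns a running event loop. A minimal standalone sketch of that pattern, not Pixeltable code:

import asyncio

import nest_asyncio

def get_loop() -> asyncio.AbstractEventLoop:
    # Reuse the already-running loop (e.g. Jupyter's) if there is one,
    # otherwise create and install a fresh loop for this thread.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    # Patch asyncio so that nested run_until_complete() calls are allowed.
    nest_asyncio.apply()
    return loop

async def answer() -> int:
    return 42

print(get_loop().run_until_complete(answer()))  # 42, in a plain script or inside a notebook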
@@ -152,11 +204,11 @@ class Env:
         return self._http_address
 
     @property
-    def user(self) ->
+    def user(self) -> str | None:
         return Config.get().get_string_value('user')
 
     @user.setter
-    def user(self, user:
+    def user(self, user: str | None) -> None:
         if user is None:
             if 'PIXELTABLE_USER' in os.environ:
                 del os.environ['PIXELTABLE_USER']
@@ -164,33 +216,47 @@ class Env:
             os.environ['PIXELTABLE_USER'] = user
 
     @property
-    def default_time_zone(self) ->
+    def default_time_zone(self) -> ZoneInfo | None:
         return self._default_time_zone
 
     @default_time_zone.setter
-    def default_time_zone(self, tz:
+    def default_time_zone(self, tz: ZoneInfo | None) -> None:
         """
         This is not a publicly visible setter; it is only for testing purposes.
         """
-
+        if tz is None:
+            tz_name = self._get_tz_name()
+        else:
+            assert isinstance(tz, ZoneInfo)
+            tz_name = tz.key
         self.engine.dispose()
         self._create_engine(time_zone_name=tz_name)
 
     @property
-    def
+    def verbosity(self) -> int:
+        return self._verbosity
+
+    @property
+    def conn(self) -> sql.Connection | None:
         assert self._current_conn is not None
         return self._current_conn
 
     @property
-    def session(self) ->
+    def session(self) -> orm.Session | None:
         assert self._current_session is not None
         return self._current_session
 
     @property
-    def dbms(self) ->
+    def dbms(self) -> Dbms | None:
         assert self._dbms is not None
         return self._dbms
 
+    @property
+    def is_using_cockroachdb(self) -> bool:
+        assert self._dbms is not None
+        return isinstance(self._dbms, CockroachDbms)
+
+    @property
     def in_xact(self) -> bool:
         return self._current_conn is not None
@@ -200,32 +266,43 @@ class Env:
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self) -> Iterator[sql.Connection]:
-        """
+    def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
+        """
+        Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
+
+        for_write: if True, uses serializable isolation; if False, uses repeatable_read
+
+        TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
+        that avoids tripping over any pending ops
+        """
         if self._current_conn is None:
             assert self._current_session is None
             try:
-
-
-
+                self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
+                with (
+                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
+                    orm.Session(conn) as session,
+                    conn.begin(),
+                ):
                     self._current_conn = conn
                     self._current_session = session
                     yield conn
             finally:
                 self._current_session = None
                 self._current_conn = None
-
+                self._current_isolation_level = None
         else:
             assert self._current_session is not None
+            assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
             yield self._current_conn
 
     def configure_logging(
         self,
         *,
-        to_stdout:
-        level:
-        add:
-        remove:
+        to_stdout: bool | None = None,
+        level: int | None = None,
+        add: str | None = None,
+        remove: str | None = None,
     ) -> None:
         """Configure logging.
 
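Note: based on the new signature above, `begin_xact()` is a context manager that yields a SQLAlchemy `Connection` and reuses the already-open connection on nested calls. A rough usage sketch (illustrative only; per the docstring, `Catalog.begin_xact()` is the intended entry point for most callers):

import sqlalchemy as sql

from pixeltable.env import Env

env = Env.get()
with env.begin_xact(for_write=True) as conn:  # serializable isolation
    # Statements issued here run inside the same transaction.
    print(conn.execute(sql.text('SELECT 1')).scalar())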
@@ -267,7 +344,7 @@ class Env:
     def set_log_level(self, level: int) -> None:
         self._default_log_level = level
 
-    def set_module_log_level(self, module: str, level:
+    def set_module_log_level(self, module: str, level: int | None) -> None:
         if level is None:
             self._module_log_level.pop(module, None)
         else:
@@ -282,6 +359,8 @@ class Env:
         # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
         path_parts = list(Path(record.pathname).parts)
         path_parts.reverse()
+        if 'pixeltable' not in path_parts:
+            return False
         max_idx = path_parts.index('pixeltable')
         for module_name in path_parts[:max_idx]:
             if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
@@ -292,6 +371,26 @@ class Env:
     def console_logger(self) -> ConsoleLogger:
         return self._console_logger
 
+    def _get_tz_name(self) -> str:
+        """Get the time zone name from the configuration, or the system local time zone if not specified.
+
+        Returns:
+            str: The time zone name.
+        """
+        tz_name = Config.get().get_string_value('time_zone')
+        if tz_name is not None:
+            # Validate tzname
+            if not isinstance(tz_name, str):
+                self._logger.error('Invalid time zone specified in configuration.')
+            else:
+                try:
+                    _ = ZoneInfo(tz_name)
+                except ZoneInfoNotFoundError:
+                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
+        else:
+            tz_name = tzlocal.get_localzone_name()
+        return tz_name
+
     def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return
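Note: `_get_tz_name()` combines a config lookup, `ZoneInfo` validation, and a `tzlocal` fallback. A small sketch of those building blocks in isolation; unlike the method above, it falls back to the system zone when the configured name is invalid rather than only logging the error:

from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

import tzlocal

def resolve_tz_name(configured: str | None) -> str:
    """Return a usable IANA time zone name."""
    if configured is not None:
        try:
            ZoneInfo(configured)  # raises if the name is unknown
            return configured
        except ZoneInfoNotFoundError:
            pass
    return tzlocal.get_localzone_name()

print(resolve_tz_name('Europe/Berlin'))  # Europe/Berlin
print(resolve_tz_name(None))             # whatever the host system is configured with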
@@ -301,22 +400,18 @@ class Env:
         config = Config.get()
 
         self._initialized = True
+
         self._media_dir = Config.get().home / 'media'
         self._file_cache_dir = Config.get().home / 'file_cache'
         self._dataset_cache_dir = Config.get().home / 'dataset_cache'
         self._log_dir = Config.get().home / 'logs'
         self._tmp_dir = Config.get().home / 'tmp'
 
-
-
-
-
-
-            self._dataset_cache_dir.mkdir()
-        if not self._log_dir.exists():
-            self._log_dir.mkdir()
-        if not self._tmp_dir.exists():
-            self._tmp_dir.mkdir()
+        self._media_dir.mkdir(exist_ok=True)
+        self._file_cache_dir.mkdir(exist_ok=True)
+        self._dataset_cache_dir.mkdir(exist_ok=True)
+        self._log_dir.mkdir(exist_ok=True)
+        self._tmp_dir.mkdir(exist_ok=True)
 
         self._file_cache_size_g = config.get_float_value('file_cache_size_g')
         if self._file_cache_size_g is None:
@@ -325,6 +420,16 @@ class Env:
                 f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
                 'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
             )
+
+        self._default_input_media_dest = config.get_string_value('input_media_dest')
+        self._default_output_media_dest = config.get_string_value('output_media_dest')
+        for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
+            if uri is not None:
+                try:
+                    _ = ObjectPath.parse_object_storage_addr(uri, False)
+                except Exception as e:
+                    raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
+
         self._pxt_api_key = config.get_string_value('api_key')
 
         # Disable spurious warnings
@@ -334,10 +439,12 @@ class Env:
         warnings.simplefilter('ignore', category=UserWarning)
         warnings.simplefilter('ignore', category=FutureWarning)
 
-        # Set
-
+        # Set verbosity level for user visible console messages
+        self._verbosity = config.get_int_value('verbosity')
+        if self._verbosity is None:
+            self._verbosity = 1
         stdout_handler = ConsoleOutputHandler(stream=stdout)
-        stdout_handler.setLevel(
+        stdout_handler.setLevel(map_level(self._verbosity))
         stdout_handler.addFilter(ConsoleMessageFilter())
         self._logger.addHandler(stdout_handler)
         self._console_logger = ConsoleLogger(self._logger)
@@ -371,6 +478,7 @@ class Env:
         http_logger.propagate = False
 
         self.clear_tmp_dir()
+        tz_name = self._get_tz_name()
 
         # configure pixeltable database
         self._init_db(config)
@@ -380,22 +488,10 @@ class Env:
                 'Reinitializing pixeltable database is not supported when running in non-local environment'
             )
 
-        tz_name = config.get_string_value('time_zone')
-        if tz_name is not None:
-            # Validate tzname
-            if not isinstance(tz_name, str):
-                self._logger.error('Invalid time zone specified in configuration.')
-            else:
-                try:
-                    _ = ZoneInfo(tz_name)
-                except ZoneInfoNotFoundError:
-                    self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
-
         if reinit_db and self._store_db_exists():
             self._drop_store_db()
 
         create_db = not self._store_db_exists()
-
         if create_db:
             self._logger.info(f'creating database at: {self.db_url}')
             self._create_store_db()
@@ -440,7 +536,7 @@ class Env:
                 raise excs.Error(error)
             self._logger.info(f'Using database at: {self.db_url}')
         else:
-            self._db_name =
+            self._db_name = config.get_string_value('db') or 'pixeltable'
             self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
             # cleanup_mode=None will leave the postgres process running after Python exits
             # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -454,30 +550,50 @@ class Env:
         assert self._db_url is not None
         assert self._db_name is not None
 
+    @retry(
+        stop=stop_after_attempt(3),  # Stop after 3 attempts
+        wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),  # Exponential backoff with jitter
+    )
     def _init_metadata(self) -> None:
         """
         Create pixeltable metadata tables and system metadata.
         This is an idempotent operation.
+
+        Retry logic handles race conditions when multiple Pixeltable processes
+        attempt to initialize metadata tables simultaneously. The first process may succeed
+        in creating tables while others encounter database constraints (e.g., "table already exists").
+        Exponential backoff with jitter reduces contention between competing processes.
         """
         assert self._sa_engine is not None
         from pixeltable import metadata
 
+        self._logger.debug('Creating pixeltable metadata')
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)
 
-    def _create_engine(self, time_zone_name:
-
+    def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
+        # Add timezone option to connection string
+        updated_url = add_option_to_db_url(self.db_url, f'-c timezone={time_zone_name}')
+
         self._sa_engine = sql.create_engine(
-
+            updated_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level
         )
 
         self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
+        self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
+        self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
 
         with self.engine.begin() as conn:
             tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
             assert isinstance(tz_name, str)
             self._logger.info(f'Database time zone is now: {tz_name}')
             self._default_time_zone = ZoneInfo(tz_name)
+            if self.is_using_cockroachdb:
+                # This could be set when the database is created, but we set it now
+                conn.execute(sql.text('SET null_ordered_last = true;'))
+                null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
+                assert isinstance(null_ordered_last, str)
+                self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
 
     def _store_db_exists(self) -> bool:
         assert self._db_name is not None
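Note: the `@retry` decorator added to `_init_metadata()` comes from `tenacity`; the same stop/wait configuration can be exercised on its own. An illustrative sketch, not package code:

import random

from tenacity import retry, stop_after_attempt, wait_exponential_jitter

@retry(
    stop=stop_after_attempt(3),  # give up after 3 tries (tenacity then raises RetryError)
    wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),
)
def create_tables() -> str:
    # Stand-in for metadata creation that may collide with a concurrent process.
    if random.random() < 0.5:
        raise RuntimeError('table already exists (simulated race)')
    return 'ok'

print(create_tables())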
@@ -512,6 +628,14 @@ class Env:
         finally:
             engine.dispose()
 
+    def _pgserver_terminate_connections_stmt(self) -> str:
+        return f"""
+            SELECT pg_terminate_backend(pg_stat_activity.pid)
+            FROM pg_stat_activity
+            WHERE pg_stat_activity.datname = '{self._db_name}'
+            AND pid <> pg_backend_pid()
+        """
+
     def _drop_store_db(self) -> None:
         assert self._db_name is not None
         engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
@@ -520,13 +644,7 @@ class Env:
         with engine.begin() as conn:
             # terminate active connections
             if self._db_server is not None:
-
-                    SELECT pg_terminate_backend(pg_stat_activity.pid)
-                    FROM pg_stat_activity
-                    WHERE pg_stat_activity.datname = '{self._db_name}'
-                    AND pid <> pg_backend_pid()
-                """
-                conn.execute(sql.text(stmt))
+                conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
             # drop db
             stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
             conn.execute(sql.text(stmt))
@@ -539,12 +657,7 @@ class Env:
         metadata.upgrade_md(self._sa_engine)
 
     @property
-    def pxt_api_key(self) -> str:
-        if self._pxt_api_key is None:
-            raise excs.Error(
-                'No API key is configured. Set the PIXELTABLE_API_KEY environment variable, or add an entry to '
-                'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
-            )
+    def pxt_api_key(self) -> str | None:
         return self._pxt_api_key
 
     def get_client(self, name: str) -> Any:
@@ -554,35 +667,51 @@ class Env:
         Args:
             - name: The name of the client
         """
-
-
-
-
-
-
-
-
-
-
-
-
+        # Return the existing client if it has already been constructed
+        with _registered_clients_lock:
+            cl = _registered_clients[name]
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized
+
+        # Retrieve parameters required to construct the requested client.
+        init_kwargs: dict[str, Any] = {}
+        for param in cl.params.values():
+            # Determine the type of the parameter for proper config parsing.
+            pname = param.name
+            t = param.annotation
+            # Deference T | None
+            if typing.get_origin(t) in (typing.Union, types.UnionType):
+                args = typing.get_args(t)
+                if args[0] is type(None):
+                    t = args[1]
+                elif args[1] is type(None):
+                    t = args[0]
+            assert isinstance(t, type), t
+            arg: Any = Config.get().get_value(pname, t, section=name)
+            if arg is not None:
+                init_kwargs[pname] = arg
+            elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{
-                    f'To fix this, specify the `{name.upper()}_{
-                    f'or put `{
+                    f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
+                    f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                 )
 
-
-
-
+        # Construct the requested client
+        with _registered_clients_lock:
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized
+            cl.client_obj = cl.init_fn(**init_kwargs)
+            self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
+            return cl.client_obj
 
     def _start_web_server(self) -> None:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
-
-        This arrangement enables serving
-        as well as external
+        On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
+        This arrangement enables serving objects hosted within _home,
+        as well as external objects inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.
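Note: `get_client()` now unwraps `T | None` parameter annotations before asking the config for a typed value. The unwrapping step on its own (a sketch; requires Python 3.10+ for the `X | Y` union syntax):

import types
import typing

def unwrap_optional(t: object) -> object:
    # Return T for annotations of the form T | None / Optional[T]; otherwise return t unchanged.
    if typing.get_origin(t) in (typing.Union, types.UnionType):
        args = [a for a in typing.get_args(t) if a is not type(None)]
        if len(args) == 1:
            return args[0]
    return t

print(unwrap_optional(str | None))            # <class 'str'>
print(unwrap_optional(typing.Optional[int]))  # <class 'int'>
print(unwrap_optional(float))                 # <class 'float'>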
@@ -600,20 +729,65 @@ class Env:
 
     def _set_up_runtime(self) -> None:
         """Check for and start runtime services"""
+        register_heif_opener()
         self._start_web_server()
         self.__register_packages()
 
+    @property
+    def default_video_encoder(self) -> str | None:
+        if self._default_video_encoder is None:
+            self._default_video_encoder = self._determine_default_video_encoder()
+        return self._default_video_encoder
+
+    def _determine_default_video_encoder(self) -> str | None:
+        """
+        Returns the first available encoder from a list of candidates.
+
+        TODO:
+        - the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
+        - allow user override via a config option 'video_encoder'
+        """
+        # look for available encoders, in this order
+        candidates = [
+            'libx264',  # GPL, best quality
+            'libopenh264',  # BSD
+        ]
+
+        try:
+            # Get list of available encoders
+            result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
+
+            if result.returncode == 0:
+                available_encoders = result.stdout
+                for encoder in candidates:
+                    # ffmpeg -encoders output format: " V..... encoder_name description"
+                    if f' {encoder} ' in available_encoders:
+                        _logger.debug(f'Using H.264 encoder: {encoder}')
+                        return encoder
+        except Exception:
+            pass
+        return None
+
     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
+        self.__register_package('accelerate')
         self.__register_package('anthropic')
+        self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
         self.__register_package('boto3')
         self.__register_package('datasets')
+        self.__register_package('diffusers')
         self.__register_package('fiftyone')
+        self.__register_package('twelvelabs')
+        self.__register_package('fal_client', library_name='fal-client')
         self.__register_package('fireworks', library_name='fireworks-ai')
+        self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
         self.__register_package('google.genai', library_name='google-genai')
+        self.__register_package('groq')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')
         self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
+        self.__register_package('librosa')
         self.__register_package('llama_cpp', library_name='llama-cpp-python')
+        self.__register_package('mcp')
         self.__register_package('mistralai')
         self.__register_package('mistune')
         self.__register_package('ollama')
@@ -622,8 +796,10 @@ class Env:
         self.__register_package('pyarrow')
         self.__register_package('pydantic')
         self.__register_package('replicate')
+        self.__register_package('reve')
         self.__register_package('sentencepiece')
         self.__register_package('sentence_transformers', library_name='sentence-transformers')
+        self.__register_package('soundfile')
         self.__register_package('spacy')
         self.__register_package('tiktoken')
         self.__register_package('together')
@@ -631,11 +807,14 @@ class Env:
         self.__register_package('torchaudio')
         self.__register_package('torchvision')
         self.__register_package('transformers')
+        self.__register_package('voyageai')
         self.__register_package('whisper', library_name='openai-whisper')
         self.__register_package('whisperx')
         self.__register_package('yolox', library_name='pixeltable-yolox')
+        self.__register_package('lancedb')
+        self.__register_package('scenedetect')
 
-    def __register_package(self, package_name: str, library_name:
+    def __register_package(self, package_name: str, library_name: str | None = None) -> None:
         is_installed: bool
         try:
             is_installed = importlib.util.find_spec(package_name) is not None
@@ -647,7 +826,11 @@ class Env:
             library_name=library_name or package_name,  # defaults to package_name unless specified otherwise
         )
 
-    def
+    def require_binary(self, binary_name: str) -> None:
+        if not shutil.which(binary_name):
+            raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
+
+    def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
         """
         Checks whether the specified optional package is available. If not, raises an exception
         with an error message informing the user how to install it.
@@ -691,14 +874,8 @@ class Env:
         else:
             os.remove(path)
 
-    def
-
-
-    def create_tmp_path(self, extension: str = '') -> Path:
-        return self._tmp_dir / f'{uuid.uuid4()}{extension}'
-
-    # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
-    def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
+    # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
+    def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
         """Returns the info object for the given id, creating it if necessary."""
         info = self._resource_pool_info.get(pool_id)
         if info is None and make_pool_info is not None:
@@ -711,6 +888,14 @@ class Env:
         assert self._media_dir is not None
         return self._media_dir
 
+    @property
+    def default_input_media_dest(self) -> str | None:
+        return self._default_input_media_dest
+
+    @property
+    def default_output_media_dest(self) -> str | None:
+        return self._default_output_media_dest
+
     @property
     def file_cache_dir(self) -> Path:
         assert self._file_cache_dir is not None
@@ -746,24 +931,74 @@ class Env:
         have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
         """
         import spacy
-        from spacy.cli.download import
+        from spacy.cli.download import download
 
         spacy_model = 'en_core_web_sm'
-
-
-        url = f'{spacy.about.__download_url__}/{filename}'
-        # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
-        # a problem, because the model might have been installed on a previous attempt.
-        self._logger.info(f'Ensuring spaCy model is installed: {filename}')
-        ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
-        if ret.returncode != 0:
-            self._logger.warning(f'pip install failed for spaCy model: {filename}')
+        self._logger.info(f'Ensuring spaCy model is installed: {spacy_model}')
+        download(spacy_model)
         self._logger.info(f'Loading spaCy model: {spacy_model}')
         try:
             self._spacy_nlp = spacy.load(spacy_model)
         except Exception as exc:
             raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
 
+    def _clean_up(self) -> None:
+        """
+        Internal cleanup method that properly closes all resources and resets state.
+        This is called before destroying the singleton instance.
+        """
+        assert self._current_session is None
+        assert self._current_conn is None
+
+        # Stop HTTP server
+        if self._httpd is not None:
+            try:
+                self._httpd.shutdown()
+                self._httpd.server_close()
+            except Exception as e:
+                _logger.warning(f'Error stopping HTTP server: {e}')
+
+        # First terminate all connections to the database
+        if self._db_server is not None:
+            assert self._dbms is not None
+            assert self._db_name is not None
+            try:
+                temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
+                try:
+                    with temp_engine.begin() as conn:
+                        conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
+                    _logger.info(f"Terminated all connections to database '{self._db_name}'")
+                except Exception as e:
+                    _logger.warning(f'Error terminating database connections: {e}')
+                finally:
+                    temp_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error stopping database server: {e}')
+
+        # Dispose of SQLAlchemy engine (after stopping db server)
+        if self._sa_engine is not None:
+            try:
+                self._sa_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error disposing engine: {e}')
+
+        # Close event loop
+        if self._event_loop is not None:
+            try:
+                if self._event_loop.is_running():
+                    self._event_loop.stop()
+                self._event_loop.close()
+            except Exception as e:
+                _logger.warning(f'Error closing event loop: {e}')
+
+        # Remove logging handlers
+        for handler in self._logger.handlers[:]:
+            try:
+                handler.close()
+                self._logger.removeHandler(handler)
+            except Exception as e:
+                _logger.warning(f'Error removing handler: {e}')
+
 
 def register_client(name: str) -> Callable:
     """Decorator that registers a third-party API client for use by Pixeltable.
@@ -792,30 +1027,35 @@ def register_client(name: str) -> Callable:
 
     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
-
-
+        params = dict(sig.parameters)
+        with _registered_clients_lock:
+            _registered_clients[name] = ApiClient(init_fn=fn, params=params)
 
     return decorator
 
 
+_registered_clients_lock: threading.Lock = threading.Lock()
 _registered_clients: dict[str, ApiClient] = {}
 
 
 @dataclass
 class ApiClient:
     init_fn: Callable
-
-    client_obj:
+    params: dict[str, inspect.Parameter]
+    client_obj: Any | None = None
 
 
 @dataclass
 class PackageInfo:
     is_installed: bool
     library_name: str  # pypi library name (may be different from package name)
-    version:
+    version: list[int] | None = None  # installed version, as a list of components (such as [3,0,2] for "3.0.2")
 
 
 TIME_FORMAT = '%H:%M.%S %f'
+# As far as rate limiting goes, we try not go lower than 5% of the capacity because we don't have perfect information
+# about the rate limits and the usage
+TARGET_RATE_LIMIT_RESOURCE_FRACT = 0.05
 
 
 @dataclass
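Note: `register_client()` stores the init function together with its signature so that `Env.get_client()` can later fill each parameter from the environment or config.toml. A hypothetical client module would register itself roughly like this (the `acme` service and `AcmeClient` class are invented for illustration):

from typing import Any

from pixeltable.env import Env, register_client

class AcmeClient:  # stand-in for a third-party SDK client
    def __init__(self, api_key: str, timeout: float = 30.0) -> None:
        self.api_key, self.timeout = api_key, timeout

@register_client('acme')
def _(api_key: str, timeout: float = 30.0) -> Any:
    # `api_key` has no default, so it must come from ACME_API_KEY or the
    # `acme` section of config.toml; `timeout` is optional.
    return AcmeClient(api_key=api_key, timeout=timeout)

client = Env.get().get_client('acme')  # constructed lazily on first use, then cached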
@@ -838,6 +1078,10 @@ class RateLimitsInfo:
     get_request_resources: Callable[..., dict[str, int]]
 
     resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
+    has_exc: bool = False
+
+    def debug_str(self) -> str:
+        return ','.join(info.debug_str() for info in self.resource_limits.values())
 
     def is_initialized(self) -> bool:
         return len(self.resource_limits) > 0
@@ -845,25 +1089,46 @@ class RateLimitsInfo:
     def reset(self) -> None:
         self.resource_limits.clear()
 
-    def record(self, **kwargs: Any) -> None:
-
+    def record(self, request_ts: datetime.datetime, reset_exc: bool = False, **kwargs: Any) -> None:
+        """Update self.resource_limits with the provided rate limit info.
+        Args:
+            - request_ts: time at which the request was made
+            - reset_exc: if True, reset the has_exc flag
+        """
         if len(self.resource_limits) == 0:
-            self.resource_limits = {k: RateLimitInfo(k,
+            self.resource_limits = {k: RateLimitInfo(k, request_ts, *v) for k, v in kwargs.items() if v is not None}
             # TODO: remove
             for info in self.resource_limits.values():
-                _logger.debug(
-                    f'Init {info.resource} rate limit: rem={info.remaining} '
-                    f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
-                )
+                _logger.debug(f'Updated resource state: {info}')
         else:
+            if self.has_exc and not reset_exc:
+                # ignore updates until we're asked to reset
+                _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
+                return
+            self.has_exc = False
             for k, v in kwargs.items():
                 if v is not None:
-                    self.resource_limits[k].update(
+                    self.resource_limits[k].update(request_ts, *v)
+                    _logger.debug(f'Updated resource state: {self.resource_limits[k]}')
 
-
-
+    def record_exc(self, request_ts: datetime.datetime, exc: Exception) -> None:
+        """Update self.resource_limits based on the exception headers
+        Args:
+            - request_ts: time at which the request that caused the exception was made
+            - exc: the exception raised"""
+        self.has_exc = True
+
+    def get_retry_delay(self, exc: Exception, attempt: int) -> float | None:
         """Returns number of seconds to wait before retry, or None if not retryable"""
-
+        # Find the highest wait until at least 5% availability of all resources
+        max_wait = 0.0
+        for limit_info in self.resource_limits.values():
+            time_until = limit_info.estimated_resource_refill_delay(
+                math.ceil(TARGET_RATE_LIMIT_RESOURCE_FRACT * limit_info.limit)
+            )
+            if time_until is not None:
+                max_wait = max(max_wait, time_until)
+        return max_wait if max_wait > 0 else None
 
 
 @dataclass
@@ -871,22 +1136,71 @@ class RateLimitInfo:
     """Container for rate limit-related information for a single resource."""
 
     resource: str
-
+    request_start_ts: datetime.datetime
     limit: int
     remaining: int
     reset_at: datetime.datetime
 
-    def
-
-
-
-
+    def debug_str(self) -> str:
+        return (
+            f'{self.resource}@{self.request_start_ts.strftime(TIME_FORMAT)}: '
+            f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
+        )
+
+    def update(
+        self, request_start_ts: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime
+    ) -> None:
+        # Responses can come out of order, especially for failed requests. We need to be careful not to overwrite
+        # the current state with less up-to-date information. We use request_start_ts as a proxy for rate limit info
+        # recency.
+        if self.request_start_ts > request_start_ts:
+            # The current state is more up-to-date than the update
+            _logger.debug(
+                f'Ignoring out-of-date update for {self.resource}. Current request_start_ts: '
+                f'{self.request_start_ts}, update: {request_start_ts}'
+            )
+            return
+        self.request_start_ts = request_start_ts
         self.limit = limit
         self.remaining = remaining
-        reset_delta = reset_at - self.reset_at
         self.reset_at = reset_at
-
-
-
-
+
+    def estimated_resource_refill_delay(self, target_remaining: int) -> float | None:
+        """Estimate time in seconds until remaining resources reaches target_remaining.
+        Assumes linear replenishment of resources over time.
+        Returns None if unable to estimate.
+        """
+        if self.remaining >= target_remaining:
+            return 0
+        if self.request_start_ts >= self.reset_at:
+            return 0
+        if self.limit < target_remaining:
+            return None
+
+        # Estimate resource refill rate based on the recorded state and timestamps. Assumes linear refill.
+        refill_rate = (self.limit - self.remaining) / (self.reset_at - self.request_start_ts).total_seconds()
+        assert refill_rate > 0, f'self={self}, target_remaining={target_remaining}'
+
+        now = datetime.datetime.now(tz=datetime.timezone.utc)
+        time_until = (target_remaining - self.remaining) / refill_rate - (now - self.request_start_ts).total_seconds()
+        return max(0, math.ceil(time_until))
+
+    def __repr__(self) -> str:
+        return (
+            f'RateLimitInfo(resource={self.resource}, request_start_ts={self.request_start_ts}, '
+            f'remaining={self.remaining}/{self.limit} ({(100 * self.remaining / self.limit):.1f}%), '
+            f'reset_at={self.reset_at})'
         )
+
+
+@dataclass
+class RuntimeCtx:
+    """
+    Container for runtime data provided by the execution system to udfs.
+
+    Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
+    """
+
+    # Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
+    # If True, call RateLimitsInfo.record() with reset_exc=True.
+    is_retry: bool = False