pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/env.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import datetime
|
|
4
5
|
import glob
|
|
5
6
|
import http.server
|
|
@@ -7,68 +8,101 @@ import importlib
|
|
|
7
8
|
import importlib.util
|
|
8
9
|
import inspect
|
|
9
10
|
import logging
|
|
11
|
+
import math
|
|
10
12
|
import os
|
|
13
|
+
import platform
|
|
11
14
|
import shutil
|
|
12
15
|
import subprocess
|
|
13
16
|
import sys
|
|
14
17
|
import threading
|
|
15
|
-
import
|
|
18
|
+
import types
|
|
19
|
+
import typing
|
|
16
20
|
import warnings
|
|
17
|
-
from
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from dataclasses import dataclass, field
|
|
18
23
|
from pathlib import Path
|
|
19
|
-
from
|
|
24
|
+
from sys import stdout
|
|
25
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
|
|
20
26
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
21
27
|
|
|
28
|
+
import nest_asyncio # type: ignore[import-untyped]
|
|
22
29
|
import pixeltable_pgserver
|
|
23
30
|
import sqlalchemy as sql
|
|
24
|
-
import
|
|
31
|
+
import tzlocal
|
|
32
|
+
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
33
|
+
from sqlalchemy import orm
|
|
34
|
+
from tenacity import retry, stop_after_attempt, wait_exponential_jitter
|
|
25
35
|
from tqdm import TqdmWarning
|
|
26
36
|
|
|
27
|
-
import
|
|
28
|
-
from pixeltable import
|
|
37
|
+
from pixeltable import exceptions as excs
|
|
38
|
+
from pixeltable.config import Config
|
|
39
|
+
from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
|
|
40
|
+
from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
|
|
29
41
|
from pixeltable.utils.http_server import make_server
|
|
42
|
+
from pixeltable.utils.object_stores import ObjectPath
|
|
43
|
+
from pixeltable.utils.sql import add_option_to_db_url
|
|
30
44
|
|
|
31
45
|
if TYPE_CHECKING:
|
|
32
46
|
import spacy
|
|
33
47
|
|
|
34
48
|
|
|
49
|
+
_logger = logging.getLogger('pixeltable')
|
|
50
|
+
|
|
51
|
+
T = TypeVar('T')
|
|
52
|
+
|
|
53
|
+
|
|
35
54
|
class Env:
|
|
36
55
|
"""
|
|
37
|
-
Store for
|
|
56
|
+
Store runtime globals for both local and non-local environments.
|
|
57
|
+
For a local environment, Pixeltable uses an embedded PostgreSQL server that runs locally in a separate process.
|
|
58
|
+
For a non-local environment, Pixeltable uses a connection string to the externally managed database.
|
|
38
59
|
"""
|
|
39
60
|
|
|
40
|
-
|
|
61
|
+
SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
|
|
62
|
+
|
|
63
|
+
_instance: Env | None = None
|
|
64
|
+
__initializing: bool = False
|
|
41
65
|
_log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
|
|
42
66
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
67
|
+
_media_dir: Path | None
|
|
68
|
+
_file_cache_dir: Path | None # cached object files with external URL
|
|
69
|
+
_dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
|
|
70
|
+
_log_dir: Path | None # log files
|
|
71
|
+
_tmp_dir: Path | None # any tmp files
|
|
72
|
+
_sa_engine: sql.engine.base.Engine | None
|
|
73
|
+
_pgdata_dir: Path | None
|
|
74
|
+
_db_name: str | None
|
|
75
|
+
_db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
|
|
76
|
+
_db_url: str | None
|
|
77
|
+
_default_time_zone: ZoneInfo | None
|
|
78
|
+
_verbosity: int
|
|
55
79
|
|
|
56
80
|
# info about optional packages that are utilized by some parts of the code
|
|
57
81
|
__optional_packages: dict[str, PackageInfo]
|
|
58
82
|
|
|
59
|
-
_spacy_nlp:
|
|
60
|
-
_httpd:
|
|
61
|
-
_http_address:
|
|
83
|
+
_spacy_nlp: spacy.Language | None
|
|
84
|
+
_httpd: http.server.HTTPServer | None
|
|
85
|
+
_http_address: str | None
|
|
62
86
|
_logger: logging.Logger
|
|
63
87
|
_default_log_level: int
|
|
64
|
-
_logfilename:
|
|
88
|
+
_logfilename: str | None
|
|
65
89
|
_log_to_stdout: bool
|
|
66
90
|
_module_log_level: dict[str, int] # module name -> log level
|
|
67
|
-
|
|
68
|
-
|
|
91
|
+
_file_cache_size_g: float
|
|
92
|
+
_default_input_media_dest: str | None
|
|
93
|
+
_default_output_media_dest: str | None
|
|
94
|
+
_pxt_api_key: str | None
|
|
69
95
|
_stdout_handler: logging.StreamHandler
|
|
96
|
+
_default_video_encoder: str | None
|
|
70
97
|
_initialized: bool
|
|
71
98
|
|
|
99
|
+
_resource_pool_info: dict[str, Any]
|
|
100
|
+
_current_conn: sql.Connection | None
|
|
101
|
+
_current_session: orm.Session | None
|
|
102
|
+
_current_isolation_level: str | None
|
|
103
|
+
_dbms: Dbms | None
|
|
104
|
+
_event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
|
|
105
|
+
|
|
72
106
|
@classmethod
|
|
73
107
|
def get(cls) -> Env:
|
|
74
108
|
if cls._instance is None:
|
|
@@ -77,15 +111,26 @@ class Env:
|
|
|
77
111
|
|
|
78
112
|
@classmethod
|
|
79
113
|
def _init_env(cls, reinit_db: bool = False) -> None:
|
|
114
|
+
assert not cls.__initializing, 'Circular env initialization detected.'
|
|
115
|
+
cls.__initializing = True
|
|
116
|
+
if cls._instance is not None:
|
|
117
|
+
cls._instance._clean_up()
|
|
118
|
+
cls._instance = None
|
|
80
119
|
env = Env()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
120
|
+
try:
|
|
121
|
+
env._set_up(reinit_db=reinit_db)
|
|
122
|
+
env._upgrade_metadata()
|
|
123
|
+
cls._instance = env
|
|
124
|
+
finally:
|
|
125
|
+
# Reset the initializing flag, even if setup fails.
|
|
126
|
+
# This prevents the environment from being left in a broken state.
|
|
127
|
+
cls.__initializing = False
|
|
128
|
+
|
|
129
|
+
def __init__(self) -> None:
|
|
130
|
+
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
84
131
|
|
|
85
|
-
def __init__(self):
|
|
86
|
-
self._home = None
|
|
87
132
|
self._media_dir = None # computed media files
|
|
88
|
-
self._file_cache_dir = None # cached
|
|
133
|
+
self._file_cache_dir = None # cached object files with external URL
|
|
89
134
|
self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
|
|
90
135
|
self._log_dir = None # log files
|
|
91
136
|
self._tmp_dir = None # any tmp files
|
|
@@ -95,11 +140,11 @@ class Env:
|
|
|
95
140
|
self._db_server = None
|
|
96
141
|
self._db_url = None
|
|
97
142
|
self._default_time_zone = None
|
|
98
|
-
|
|
99
143
|
self.__optional_packages = {}
|
|
100
144
|
self._spacy_nlp = None
|
|
101
145
|
self._httpd = None
|
|
102
146
|
self._http_address = None
|
|
147
|
+
self._default_video_encoder = None
|
|
103
148
|
|
|
104
149
|
# logging-related state
|
|
105
150
|
self._logger = logging.getLogger('pixeltable')
|
|
@@ -111,19 +156,42 @@ class Env:
|
|
|
111
156
|
self._log_to_stdout = False
|
|
112
157
|
self._module_log_level = {} # module name -> log level
|
|
113
158
|
|
|
114
|
-
# config
|
|
115
|
-
self._config_file = None
|
|
116
|
-
self._config = None
|
|
117
|
-
|
|
118
159
|
# create logging handler to also log to stdout
|
|
119
160
|
self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
|
|
120
161
|
self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
|
|
121
162
|
self._initialized = False
|
|
122
163
|
|
|
164
|
+
self._resource_pool_info = {}
|
|
165
|
+
self._current_conn = None
|
|
166
|
+
self._current_session = None
|
|
167
|
+
self._current_isolation_level = None
|
|
168
|
+
self._dbms = None
|
|
169
|
+
self._event_loop = None
|
|
170
|
+
|
|
171
|
+
def _init_event_loop(self) -> None:
|
|
172
|
+
try:
|
|
173
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
174
|
+
# multiple run_until_complete()
|
|
175
|
+
running_loop = asyncio.get_running_loop()
|
|
176
|
+
self._event_loop = running_loop
|
|
177
|
+
_logger.debug('Patched running loop')
|
|
178
|
+
except RuntimeError:
|
|
179
|
+
self._event_loop = asyncio.new_event_loop()
|
|
180
|
+
asyncio.set_event_loop(self._event_loop)
|
|
181
|
+
# we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
|
|
182
|
+
self._event_loop.slow_callback_duration = 3600
|
|
183
|
+
|
|
184
|
+
# always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
|
|
185
|
+
# see run_coroutine_synchronously()
|
|
186
|
+
nest_asyncio.apply()
|
|
187
|
+
if _logger.isEnabledFor(logging.DEBUG):
|
|
188
|
+
self._event_loop.set_debug(True)
|
|
189
|
+
|
|
123
190
|
@property
|
|
124
|
-
def
|
|
125
|
-
|
|
126
|
-
|
|
191
|
+
def event_loop(self) -> asyncio.AbstractEventLoop:
|
|
192
|
+
if self._event_loop is None:
|
|
193
|
+
self._init_event_loop()
|
|
194
|
+
return self._event_loop
|
|
127
195
|
|
|
128
196
|
@property
|
|
129
197
|
def db_url(self) -> str:
|
|
@@ -136,25 +204,105 @@ class Env:
|
|
|
136
204
|
return self._http_address
|
|
137
205
|
|
|
138
206
|
@property
|
|
139
|
-
def
|
|
207
|
+
def user(self) -> str | None:
|
|
208
|
+
return Config.get().get_string_value('user')
|
|
209
|
+
|
|
210
|
+
@user.setter
|
|
211
|
+
def user(self, user: str | None) -> None:
|
|
212
|
+
if user is None:
|
|
213
|
+
if 'PIXELTABLE_USER' in os.environ:
|
|
214
|
+
del os.environ['PIXELTABLE_USER']
|
|
215
|
+
else:
|
|
216
|
+
os.environ['PIXELTABLE_USER'] = user
|
|
217
|
+
|
|
218
|
+
@property
|
|
219
|
+
def default_time_zone(self) -> ZoneInfo | None:
|
|
140
220
|
return self._default_time_zone
|
|
141
221
|
|
|
142
222
|
@default_time_zone.setter
|
|
143
|
-
def default_time_zone(self, tz:
|
|
223
|
+
def default_time_zone(self, tz: ZoneInfo | None) -> None:
|
|
144
224
|
"""
|
|
145
225
|
This is not a publicly visible setter; it is only for testing purposes.
|
|
146
226
|
"""
|
|
147
|
-
|
|
227
|
+
if tz is None:
|
|
228
|
+
tz_name = self._get_tz_name()
|
|
229
|
+
else:
|
|
230
|
+
assert isinstance(tz, ZoneInfo)
|
|
231
|
+
tz_name = tz.key
|
|
148
232
|
self.engine.dispose()
|
|
149
233
|
self._create_engine(time_zone_name=tz_name)
|
|
150
234
|
|
|
235
|
+
@property
|
|
236
|
+
def verbosity(self) -> int:
|
|
237
|
+
return self._verbosity
|
|
238
|
+
|
|
239
|
+
@property
|
|
240
|
+
def conn(self) -> sql.Connection | None:
|
|
241
|
+
assert self._current_conn is not None
|
|
242
|
+
return self._current_conn
|
|
243
|
+
|
|
244
|
+
@property
|
|
245
|
+
def session(self) -> orm.Session | None:
|
|
246
|
+
assert self._current_session is not None
|
|
247
|
+
return self._current_session
|
|
248
|
+
|
|
249
|
+
@property
|
|
250
|
+
def dbms(self) -> Dbms | None:
|
|
251
|
+
assert self._dbms is not None
|
|
252
|
+
return self._dbms
|
|
253
|
+
|
|
254
|
+
@property
|
|
255
|
+
def is_using_cockroachdb(self) -> bool:
|
|
256
|
+
assert self._dbms is not None
|
|
257
|
+
return isinstance(self._dbms, CockroachDbms)
|
|
258
|
+
|
|
259
|
+
@property
|
|
260
|
+
def in_xact(self) -> bool:
|
|
261
|
+
return self._current_conn is not None
|
|
262
|
+
|
|
263
|
+
@property
|
|
264
|
+
def is_local(self) -> bool:
|
|
265
|
+
assert self._db_url is not None # is_local should be called only after db initialization
|
|
266
|
+
return self._db_server is not None
|
|
267
|
+
|
|
268
|
+
@contextmanager
|
|
269
|
+
def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
|
|
270
|
+
"""
|
|
271
|
+
Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
|
|
272
|
+
|
|
273
|
+
for_write: if True, uses serializable isolation; if False, uses repeatable_read
|
|
274
|
+
|
|
275
|
+
TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
|
|
276
|
+
that avoids tripping over any pending ops
|
|
277
|
+
"""
|
|
278
|
+
if self._current_conn is None:
|
|
279
|
+
assert self._current_session is None
|
|
280
|
+
try:
|
|
281
|
+
self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
|
|
282
|
+
with (
|
|
283
|
+
self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
|
|
284
|
+
orm.Session(conn) as session,
|
|
285
|
+
conn.begin(),
|
|
286
|
+
):
|
|
287
|
+
self._current_conn = conn
|
|
288
|
+
self._current_session = session
|
|
289
|
+
yield conn
|
|
290
|
+
finally:
|
|
291
|
+
self._current_session = None
|
|
292
|
+
self._current_conn = None
|
|
293
|
+
self._current_isolation_level = None
|
|
294
|
+
else:
|
|
295
|
+
assert self._current_session is not None
|
|
296
|
+
assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
|
|
297
|
+
yield self._current_conn
|
|
298
|
+
|
|
151
299
|
def configure_logging(
|
|
152
300
|
self,
|
|
153
301
|
*,
|
|
154
|
-
to_stdout:
|
|
155
|
-
level:
|
|
156
|
-
add:
|
|
157
|
-
remove:
|
|
302
|
+
to_stdout: bool | None = None,
|
|
303
|
+
level: int | None = None,
|
|
304
|
+
add: str | None = None,
|
|
305
|
+
remove: str | None = None,
|
|
158
306
|
) -> None:
|
|
159
307
|
"""Configure logging.
|
|
160
308
|
|
|
@@ -196,7 +344,7 @@ class Env:
|
|
|
196
344
|
def set_log_level(self, level: int) -> None:
|
|
197
345
|
self._default_log_level = level
|
|
198
346
|
|
|
199
|
-
def set_module_log_level(self, module: str, level:
|
|
347
|
+
def set_module_log_level(self, module: str, level: int | None) -> None:
|
|
200
348
|
if level is None:
|
|
201
349
|
self._module_log_level.pop(module, None)
|
|
202
350
|
else:
|
|
@@ -211,14 +359,37 @@ class Env:
|
|
|
211
359
|
# accept log messages from a configured pixeltable module (at any level of the module hierarchy)
|
|
212
360
|
path_parts = list(Path(record.pathname).parts)
|
|
213
361
|
path_parts.reverse()
|
|
362
|
+
if 'pixeltable' not in path_parts:
|
|
363
|
+
return False
|
|
214
364
|
max_idx = path_parts.index('pixeltable')
|
|
215
365
|
for module_name in path_parts[:max_idx]:
|
|
216
366
|
if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
|
|
217
367
|
return True
|
|
218
|
-
|
|
219
|
-
|
|
368
|
+
return record.levelno >= self._default_log_level
|
|
369
|
+
|
|
370
|
+
@property
|
|
371
|
+
def console_logger(self) -> ConsoleLogger:
|
|
372
|
+
return self._console_logger
|
|
373
|
+
|
|
374
|
+
def _get_tz_name(self) -> str:
|
|
375
|
+
"""Get the time zone name from the configuration, or the system local time zone if not specified.
|
|
376
|
+
|
|
377
|
+
Returns:
|
|
378
|
+
str: The time zone name.
|
|
379
|
+
"""
|
|
380
|
+
tz_name = Config.get().get_string_value('time_zone')
|
|
381
|
+
if tz_name is not None:
|
|
382
|
+
# Validate tzname
|
|
383
|
+
if not isinstance(tz_name, str):
|
|
384
|
+
self._logger.error('Invalid time zone specified in configuration.')
|
|
385
|
+
else:
|
|
386
|
+
try:
|
|
387
|
+
_ = ZoneInfo(tz_name)
|
|
388
|
+
except ZoneInfoNotFoundError:
|
|
389
|
+
self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
|
|
220
390
|
else:
|
|
221
|
-
|
|
391
|
+
tz_name = tzlocal.get_localzone_name()
|
|
392
|
+
return tz_name
|
|
222
393
|
|
|
223
394
|
def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
|
|
224
395
|
if self._initialized:
|
|
@@ -226,55 +397,57 @@ class Env:
|
|
|
226
397
|
|
|
227
398
|
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
|
228
399
|
|
|
400
|
+
config = Config.get()
|
|
401
|
+
|
|
229
402
|
self._initialized = True
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
self.
|
|
233
|
-
self.
|
|
234
|
-
self.
|
|
235
|
-
self.
|
|
236
|
-
|
|
237
|
-
self.
|
|
238
|
-
self.
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
# we don't have our logger set up yet, so print to stdout
|
|
245
|
-
print(f'Creating a Pixeltable instance at: {self._home}')
|
|
246
|
-
self._home.mkdir()
|
|
247
|
-
# TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
|
|
248
|
-
# self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
|
|
249
|
-
# PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
|
|
250
|
-
reinit_db = True
|
|
251
|
-
|
|
252
|
-
if not self._media_dir.exists():
|
|
253
|
-
self._media_dir.mkdir()
|
|
254
|
-
if not self._file_cache_dir.exists():
|
|
255
|
-
self._file_cache_dir.mkdir()
|
|
256
|
-
if not self._dataset_cache_dir.exists():
|
|
257
|
-
self._dataset_cache_dir.mkdir()
|
|
258
|
-
if not self._log_dir.exists():
|
|
259
|
-
self._log_dir.mkdir()
|
|
260
|
-
if not self._tmp_dir.exists():
|
|
261
|
-
self._tmp_dir.mkdir()
|
|
262
|
-
|
|
263
|
-
# Read in the config
|
|
264
|
-
self._config = Config.from_file(self._config_file)
|
|
265
|
-
self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
|
|
403
|
+
|
|
404
|
+
self._media_dir = Config.get().home / 'media'
|
|
405
|
+
self._file_cache_dir = Config.get().home / 'file_cache'
|
|
406
|
+
self._dataset_cache_dir = Config.get().home / 'dataset_cache'
|
|
407
|
+
self._log_dir = Config.get().home / 'logs'
|
|
408
|
+
self._tmp_dir = Config.get().home / 'tmp'
|
|
409
|
+
|
|
410
|
+
self._media_dir.mkdir(exist_ok=True)
|
|
411
|
+
self._file_cache_dir.mkdir(exist_ok=True)
|
|
412
|
+
self._dataset_cache_dir.mkdir(exist_ok=True)
|
|
413
|
+
self._log_dir.mkdir(exist_ok=True)
|
|
414
|
+
self._tmp_dir.mkdir(exist_ok=True)
|
|
415
|
+
|
|
416
|
+
self._file_cache_size_g = config.get_float_value('file_cache_size_g')
|
|
266
417
|
if self._file_cache_size_g is None:
|
|
267
418
|
raise excs.Error(
|
|
268
419
|
'pixeltable/file_cache_size_g is missing from configuration\n'
|
|
269
|
-
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {
|
|
420
|
+
f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
|
|
270
421
|
'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
|
|
271
422
|
)
|
|
272
423
|
|
|
424
|
+
self._default_input_media_dest = config.get_string_value('input_media_dest')
|
|
425
|
+
self._default_output_media_dest = config.get_string_value('output_media_dest')
|
|
426
|
+
for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
|
|
427
|
+
if uri is not None:
|
|
428
|
+
try:
|
|
429
|
+
_ = ObjectPath.parse_object_storage_addr(uri, False)
|
|
430
|
+
except Exception as e:
|
|
431
|
+
raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
|
|
432
|
+
|
|
433
|
+
self._pxt_api_key = config.get_string_value('api_key')
|
|
434
|
+
|
|
273
435
|
# Disable spurious warnings
|
|
274
436
|
warnings.simplefilter('ignore', category=TqdmWarning)
|
|
275
|
-
if
|
|
437
|
+
if config.get_bool_value('hide_warnings'):
|
|
276
438
|
# Disable more warnings
|
|
277
439
|
warnings.simplefilter('ignore', category=UserWarning)
|
|
440
|
+
warnings.simplefilter('ignore', category=FutureWarning)
|
|
441
|
+
|
|
442
|
+
# Set verbosity level for user visible console messages
|
|
443
|
+
self._verbosity = config.get_int_value('verbosity')
|
|
444
|
+
if self._verbosity is None:
|
|
445
|
+
self._verbosity = 1
|
|
446
|
+
stdout_handler = ConsoleOutputHandler(stream=stdout)
|
|
447
|
+
stdout_handler.setLevel(map_level(self._verbosity))
|
|
448
|
+
stdout_handler.addFilter(ConsoleMessageFilter())
|
|
449
|
+
self._logger.addHandler(stdout_handler)
|
|
450
|
+
self._console_logger = ConsoleLogger(self._logger)
|
|
278
451
|
|
|
279
452
|
# configure _logger to log to a file
|
|
280
453
|
self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
|
|
@@ -304,33 +477,21 @@ class Env:
|
|
|
304
477
|
http_logger.addHandler(http_fh)
|
|
305
478
|
http_logger.propagate = False
|
|
306
479
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
os.remove(path)
|
|
310
|
-
|
|
311
|
-
self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
|
|
312
|
-
self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
|
|
480
|
+
self.clear_tmp_dir()
|
|
481
|
+
tz_name = self._get_tz_name()
|
|
313
482
|
|
|
314
|
-
#
|
|
315
|
-
self.
|
|
316
|
-
self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
|
|
483
|
+
# configure pixeltable database
|
|
484
|
+
self._init_db(config)
|
|
317
485
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
self._logger.error(f'Invalid time zone specified in configuration.')
|
|
323
|
-
else:
|
|
324
|
-
try:
|
|
325
|
-
_ = ZoneInfo(tz_name)
|
|
326
|
-
except ZoneInfoNotFoundError:
|
|
327
|
-
self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
|
|
486
|
+
if reinit_db and not self.is_local:
|
|
487
|
+
raise excs.Error(
|
|
488
|
+
'Reinitializing pixeltable database is not supported when running in non-local environment'
|
|
489
|
+
)
|
|
328
490
|
|
|
329
491
|
if reinit_db and self._store_db_exists():
|
|
330
492
|
self._drop_store_db()
|
|
331
493
|
|
|
332
494
|
create_db = not self._store_db_exists()
|
|
333
|
-
|
|
334
495
|
if create_db:
|
|
335
496
|
self._logger.info(f'creating database at: {self.db_url}')
|
|
336
497
|
self._create_store_db()
|
|
@@ -340,38 +501,104 @@ class Env:
|
|
|
340
501
|
# Create the SQLAlchemy engine. This will also set the default time zone.
|
|
341
502
|
self._create_engine(time_zone_name=tz_name, echo=echo)
|
|
342
503
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
schema.base_metadata.create_all(self._sa_engine)
|
|
346
|
-
metadata.create_system_info(self._sa_engine)
|
|
504
|
+
# Create catalog tables and system metadata
|
|
505
|
+
self._init_metadata()
|
|
347
506
|
|
|
348
|
-
|
|
507
|
+
self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')
|
|
349
508
|
|
|
350
509
|
# we now have a home directory and db; start other services
|
|
351
510
|
self._set_up_runtime()
|
|
352
511
|
self.log_to_stdout(False)
|
|
353
512
|
|
|
354
|
-
def
|
|
355
|
-
|
|
513
|
+
def _init_db(self, config: Config) -> None:
|
|
514
|
+
"""
|
|
515
|
+
Initialize the pixeltable database along with its associated DBMS.
|
|
516
|
+
"""
|
|
517
|
+
db_connect_str = config.get_string_value('DB_CONNECT_STR')
|
|
518
|
+
if db_connect_str is not None:
|
|
519
|
+
try:
|
|
520
|
+
db_url = sql.make_url(db_connect_str)
|
|
521
|
+
except sql.exc.ArgumentError as e:
|
|
522
|
+
error = f'Invalid db connection string {db_connect_str}: {e}'
|
|
523
|
+
self._logger.error(error)
|
|
524
|
+
raise excs.Error(error) from e
|
|
525
|
+
self._db_url = db_url.render_as_string(hide_password=False)
|
|
526
|
+
self._db_name = db_url.database # use the dbname given in connect string
|
|
527
|
+
dialect = db_url.get_dialect().name
|
|
528
|
+
if dialect == 'cockroachdb':
|
|
529
|
+
self._dbms = CockroachDbms(db_url)
|
|
530
|
+
else:
|
|
531
|
+
raise excs.Error(f'Unsupported DBMS {dialect}')
|
|
532
|
+
# Check if database exists
|
|
533
|
+
if not self._store_db_exists():
|
|
534
|
+
error = f'Database {self._db_name!r} does not exist'
|
|
535
|
+
self._logger.error(error)
|
|
536
|
+
raise excs.Error(error)
|
|
537
|
+
self._logger.info(f'Using database at: {self.db_url}')
|
|
538
|
+
else:
|
|
539
|
+
self._db_name = config.get_string_value('db') or 'pixeltable'
|
|
540
|
+
self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
|
|
541
|
+
# cleanup_mode=None will leave the postgres process running after Python exits
|
|
542
|
+
# cleanup_mode='stop' will terminate the postgres process when Python exits
|
|
543
|
+
# On Windows, we need cleanup_mode='stop' because child processes are killed automatically when the parent
|
|
544
|
+
# process (such as Terminal or VSCode) exits, potentially leaving it in an unusable state.
|
|
545
|
+
cleanup_mode = 'stop' if platform.system() == 'Windows' else None
|
|
546
|
+
self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=cleanup_mode)
|
|
547
|
+
self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
|
|
548
|
+
self._dbms = PostgresqlDbms(sql.make_url(self._db_url))
|
|
549
|
+
assert self._dbms is not None
|
|
550
|
+
assert self._db_url is not None
|
|
551
|
+
assert self._db_name is not None
|
|
552
|
+
|
|
553
|
+
@retry(
|
|
554
|
+
stop=stop_after_attempt(3), # Stop after 3 attempts
|
|
555
|
+
wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
|
|
556
|
+
)
|
|
557
|
+
def _init_metadata(self) -> None:
|
|
558
|
+
"""
|
|
559
|
+
Create pixeltable metadata tables and system metadata.
|
|
560
|
+
This is an idempotent operation.
|
|
561
|
+
|
|
562
|
+
Retry logic handles race conditions when multiple Pixeltable processes
|
|
563
|
+
attempt to initialize metadata tables simultaneously. The first process may succeed
|
|
564
|
+
in creating tables while others encounter database constraints (e.g., "table already exists").
|
|
565
|
+
Exponential backoff with jitter reduces contention between competing processes.
|
|
566
|
+
"""
|
|
567
|
+
assert self._sa_engine is not None
|
|
568
|
+
from pixeltable import metadata
|
|
569
|
+
|
|
570
|
+
self._logger.debug('Creating pixeltable metadata')
|
|
571
|
+
metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
|
|
572
|
+
metadata.create_system_info(self._sa_engine)
|
|
573
|
+
|
|
574
|
+
def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
|
|
575
|
+
# Add timezone option to connection string
|
|
576
|
+
updated_url = add_option_to_db_url(self.db_url, f'-c timezone={time_zone_name}')
|
|
577
|
+
|
|
356
578
|
self._sa_engine = sql.create_engine(
|
|
357
|
-
self.
|
|
358
|
-
echo=echo,
|
|
359
|
-
future=True,
|
|
360
|
-
isolation_level='AUTOCOMMIT',
|
|
361
|
-
connect_args=connect_args,
|
|
579
|
+
updated_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level
|
|
362
580
|
)
|
|
581
|
+
|
|
363
582
|
self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
|
|
583
|
+
self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
|
|
584
|
+
self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
|
|
585
|
+
|
|
364
586
|
with self.engine.begin() as conn:
|
|
365
587
|
tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
|
|
366
588
|
assert isinstance(tz_name, str)
|
|
367
589
|
self._logger.info(f'Database time zone is now: {tz_name}')
|
|
368
590
|
self._default_time_zone = ZoneInfo(tz_name)
|
|
591
|
+
if self.is_using_cockroachdb:
|
|
592
|
+
# This could be set when the database is created, but we set it now
|
|
593
|
+
conn.execute(sql.text('SET null_ordered_last = true;'))
|
|
594
|
+
null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
|
|
595
|
+
assert isinstance(null_ordered_last, str)
|
|
596
|
+
self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
|
|
369
597
|
|
|
370
598
|
def _store_db_exists(self) -> bool:
|
|
371
599
|
assert self._db_name is not None
|
|
372
600
|
# don't try to connect to self.db_name, it may not exist
|
|
373
|
-
|
|
374
|
-
engine = sql.create_engine(db_url, future=True)
|
|
601
|
+
engine = sql.create_engine(self._dbms.default_system_db_url(), future=True)
|
|
375
602
|
try:
|
|
376
603
|
with engine.begin() as conn:
|
|
377
604
|
stmt = f"SELECT COUNT(*) FROM pg_database WHERE datname = '{self._db_name}'"
|
|
@@ -384,53 +611,55 @@ class Env:
|
|
|
384
611
|
def _create_store_db(self) -> None:
|
|
385
612
|
assert self._db_name is not None
|
|
386
613
|
# create the db
|
|
387
|
-
|
|
388
|
-
engine = sql.create_engine(pg_db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
614
|
+
engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
|
|
389
615
|
preparer = engine.dialect.identifier_preparer
|
|
390
616
|
try:
|
|
391
617
|
with engine.begin() as conn:
|
|
392
|
-
|
|
393
|
-
stmt = (
|
|
394
|
-
f"CREATE DATABASE {preparer.quote(self._db_name)} "
|
|
395
|
-
"ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
396
|
-
)
|
|
618
|
+
stmt = self._dbms.create_db_stmt(preparer.quote(self._db_name))
|
|
397
619
|
conn.execute(sql.text(stmt))
|
|
398
620
|
finally:
|
|
399
621
|
engine.dispose()
|
|
400
622
|
|
|
401
623
|
# enable pgvector
|
|
402
|
-
|
|
403
|
-
engine = sql.create_engine(store_db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
624
|
+
engine = sql.create_engine(self.db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
404
625
|
try:
|
|
405
626
|
with engine.begin() as conn:
|
|
406
627
|
conn.execute(sql.text('CREATE EXTENSION vector'))
|
|
407
628
|
finally:
|
|
408
629
|
engine.dispose()
|
|
409
630
|
|
|
631
|
+
def _pgserver_terminate_connections_stmt(self) -> str:
|
|
632
|
+
return f"""
|
|
633
|
+
SELECT pg_terminate_backend(pg_stat_activity.pid)
|
|
634
|
+
FROM pg_stat_activity
|
|
635
|
+
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
636
|
+
AND pid <> pg_backend_pid()
|
|
637
|
+
"""
|
|
638
|
+
|
|
410
639
|
def _drop_store_db(self) -> None:
|
|
411
640
|
assert self._db_name is not None
|
|
412
|
-
|
|
413
|
-
engine = sql.create_engine(db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
641
|
+
engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
|
|
414
642
|
preparer = engine.dialect.identifier_preparer
|
|
415
643
|
try:
|
|
416
644
|
with engine.begin() as conn:
|
|
417
645
|
# terminate active connections
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
FROM pg_stat_activity
|
|
421
|
-
WHERE pg_stat_activity.datname = '{self._db_name}'
|
|
422
|
-
AND pid <> pg_backend_pid()
|
|
423
|
-
""")
|
|
424
|
-
conn.execute(sql.text(stmt))
|
|
646
|
+
if self._db_server is not None:
|
|
647
|
+
conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
|
|
425
648
|
# drop db
|
|
426
|
-
stmt =
|
|
649
|
+
stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
|
|
427
650
|
conn.execute(sql.text(stmt))
|
|
428
651
|
finally:
|
|
429
652
|
engine.dispose()
|
|
430
653
|
|
|
431
654
|
def _upgrade_metadata(self) -> None:
|
|
655
|
+
from pixeltable import metadata
|
|
656
|
+
|
|
432
657
|
metadata.upgrade_md(self._sa_engine)
|
|
433
658
|
|
|
659
|
+
@property
|
|
660
|
+
def pxt_api_key(self) -> str | None:
|
|
661
|
+
return self._pxt_api_key
|
|
662
|
+
|
|
434
663
|
def get_client(self, name: str) -> Any:
|
|
435
664
|
"""
|
|
436
665
|
Gets the client with the specified name, initializing it if necessary.
|
|
@@ -438,35 +667,51 @@ class Env:
|
|
|
438
667
|
Args:
|
|
439
668
|
- name: The name of the client
|
|
440
669
|
"""
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
670
|
+
# Return the existing client if it has already been constructed
|
|
671
|
+
with _registered_clients_lock:
|
|
672
|
+
cl = _registered_clients[name]
|
|
673
|
+
if cl.client_obj is not None:
|
|
674
|
+
return cl.client_obj # Already initialized
|
|
675
|
+
|
|
676
|
+
# Retrieve parameters required to construct the requested client.
|
|
677
|
+
init_kwargs: dict[str, Any] = {}
|
|
678
|
+
for param in cl.params.values():
|
|
679
|
+
# Determine the type of the parameter for proper config parsing.
|
|
680
|
+
pname = param.name
|
|
681
|
+
t = param.annotation
|
|
682
|
+
# Deference T | None
|
|
683
|
+
if typing.get_origin(t) in (typing.Union, types.UnionType):
|
|
684
|
+
args = typing.get_args(t)
|
|
685
|
+
if args[0] is type(None):
|
|
686
|
+
t = args[1]
|
|
687
|
+
elif args[1] is type(None):
|
|
688
|
+
t = args[0]
|
|
689
|
+
assert isinstance(t, type), t
|
|
690
|
+
arg: Any = Config.get().get_value(pname, t, section=name)
|
|
691
|
+
if arg is not None:
|
|
692
|
+
init_kwargs[pname] = arg
|
|
693
|
+
elif param.default is inspect.Parameter.empty:
|
|
453
694
|
raise excs.Error(
|
|
454
|
-
f'`{name}` client not initialized: parameter `{
|
|
455
|
-
f'To fix this, specify the `{name.upper()}_{
|
|
456
|
-
f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
695
|
+
f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
|
|
696
|
+
f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
|
|
697
|
+
f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
457
698
|
)
|
|
458
699
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
700
|
+
# Construct the requested client
|
|
701
|
+
with _registered_clients_lock:
|
|
702
|
+
if cl.client_obj is not None:
|
|
703
|
+
return cl.client_obj # Already initialized
|
|
704
|
+
cl.client_obj = cl.init_fn(**init_kwargs)
|
|
705
|
+
self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
|
|
706
|
+
return cl.client_obj
|
|
462
707
|
|
|
463
708
|
def _start_web_server(self) -> None:
|
|
464
709
|
"""
|
|
465
710
|
The http server root is the file system root.
|
|
466
711
|
eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
|
|
467
|
-
|
|
468
|
-
This arrangement enables serving
|
|
469
|
-
as well as external
|
|
712
|
+
On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
|
|
713
|
+
This arrangement enables serving objects hosted within _home,
|
|
714
|
+
as well as external objects inserted into pixeltable or produced by pixeltable.
|
|
470
715
|
The port is chosen dynamically to prevent conflicts.
|
|
471
716
|
"""
|
|
472
717
|
# Port 0 means OS picks one for us.
|
|
@@ -474,7 +719,7 @@ class Env:
|
|
|
474
719
|
port = self._httpd.server_address[1]
|
|
475
720
|
self._http_address = f'http://127.0.0.1:{port}'
|
|
476
721
|
|
|
477
|
-
def run_server():
|
|
722
|
+
def run_server() -> None:
|
|
478
723
|
logging.log(logging.INFO, f'running web server at {self._http_address}')
|
|
479
724
|
self._httpd.serve_forever()
|
|
480
725
|
|
|
@@ -484,30 +729,77 @@ class Env:
|
|
|
484
729
|
|
|
485
730
|
def _set_up_runtime(self) -> None:
|
|
486
731
|
"""Check for and start runtime services"""
|
|
732
|
+
register_heif_opener()
|
|
487
733
|
self._start_web_server()
|
|
488
734
|
self.__register_packages()
|
|
489
|
-
|
|
490
|
-
|
|
735
|
+
|
|
736
|
+
@property
|
|
737
|
+
def default_video_encoder(self) -> str | None:
|
|
738
|
+
if self._default_video_encoder is None:
|
|
739
|
+
self._default_video_encoder = self._determine_default_video_encoder()
|
|
740
|
+
return self._default_video_encoder
|
|
741
|
+
|
|
742
|
+
def _determine_default_video_encoder(self) -> str | None:
|
|
743
|
+
"""
|
|
744
|
+
Returns the first available encoder from a list of candidates.
|
|
745
|
+
|
|
746
|
+
TODO:
|
|
747
|
+
- the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
|
|
748
|
+
- allow user override via a config option 'video_encoder'
|
|
749
|
+
"""
|
|
750
|
+
# look for available encoders, in this order
|
|
751
|
+
candidates = [
|
|
752
|
+
'libx264', # GPL, best quality
|
|
753
|
+
'libopenh264', # BSD
|
|
754
|
+
]
|
|
755
|
+
|
|
756
|
+
try:
|
|
757
|
+
# Get list of available encoders
|
|
758
|
+
result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
|
|
759
|
+
|
|
760
|
+
if result.returncode == 0:
|
|
761
|
+
available_encoders = result.stdout
|
|
762
|
+
for encoder in candidates:
|
|
763
|
+
# ffmpeg -encoders output format: " V..... encoder_name description"
|
|
764
|
+
if f' {encoder} ' in available_encoders:
|
|
765
|
+
_logger.debug(f'Using H.264 encoder: {encoder}')
|
|
766
|
+
return encoder
|
|
767
|
+
except Exception:
|
|
768
|
+
pass
|
|
769
|
+
return None
|
|
491
770
|
|
|
492
771
|
def __register_packages(self) -> None:
|
|
493
772
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
773
|
+
self.__register_package('accelerate')
|
|
494
774
|
self.__register_package('anthropic')
|
|
775
|
+
self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
|
|
495
776
|
self.__register_package('boto3')
|
|
496
777
|
self.__register_package('datasets')
|
|
778
|
+
self.__register_package('diffusers')
|
|
497
779
|
self.__register_package('fiftyone')
|
|
780
|
+
self.__register_package('twelvelabs')
|
|
781
|
+
self.__register_package('fal_client', library_name='fal-client')
|
|
498
782
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
783
|
+
self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
|
|
784
|
+
self.__register_package('google.genai', library_name='google-genai')
|
|
785
|
+
self.__register_package('groq')
|
|
499
786
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
500
787
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
788
|
+
self.__register_package('librosa')
|
|
501
789
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
790
|
+
self.__register_package('mcp')
|
|
502
791
|
self.__register_package('mistralai')
|
|
503
792
|
self.__register_package('mistune')
|
|
504
793
|
self.__register_package('ollama')
|
|
505
794
|
self.__register_package('openai')
|
|
506
795
|
self.__register_package('openpyxl')
|
|
507
796
|
self.__register_package('pyarrow')
|
|
797
|
+
self.__register_package('pydantic')
|
|
508
798
|
self.__register_package('replicate')
|
|
799
|
+
self.__register_package('reve')
|
|
509
800
|
self.__register_package('sentencepiece')
|
|
510
801
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
802
|
+
self.__register_package('soundfile')
|
|
511
803
|
self.__register_package('spacy')
|
|
512
804
|
self.__register_package('tiktoken')
|
|
513
805
|
self.__register_package('together')
|
|
@@ -515,17 +807,30 @@ class Env:
|
|
|
515
807
|
self.__register_package('torchaudio')
|
|
516
808
|
self.__register_package('torchvision')
|
|
517
809
|
self.__register_package('transformers')
|
|
810
|
+
self.__register_package('voyageai')
|
|
518
811
|
self.__register_package('whisper', library_name='openai-whisper')
|
|
519
812
|
self.__register_package('whisperx')
|
|
520
|
-
self.__register_package('yolox', library_name='
|
|
813
|
+
self.__register_package('yolox', library_name='pixeltable-yolox')
|
|
814
|
+
self.__register_package('lancedb')
|
|
815
|
+
self.__register_package('scenedetect')
|
|
521
816
|
|
|
522
|
-
def __register_package(self, package_name: str, library_name:
|
|
817
|
+
def __register_package(self, package_name: str, library_name: str | None = None) -> None:
|
|
818
|
+
is_installed: bool
|
|
819
|
+
try:
|
|
820
|
+
is_installed = importlib.util.find_spec(package_name) is not None
|
|
821
|
+
except ModuleNotFoundError:
|
|
822
|
+
# This can happen if the parent of `package_name` is not installed.
|
|
823
|
+
is_installed = False
|
|
523
824
|
self.__optional_packages[package_name] = PackageInfo(
|
|
524
|
-
is_installed=
|
|
525
|
-
library_name=library_name or package_name # defaults to package_name unless specified otherwise
|
|
825
|
+
is_installed=is_installed,
|
|
826
|
+
library_name=library_name or package_name, # defaults to package_name unless specified otherwise
|
|
526
827
|
)
|
|
527
828
|
|
|
528
|
-
def
|
|
829
|
+
def require_binary(self, binary_name: str) -> None:
|
|
830
|
+
if not shutil.which(binary_name):
|
|
831
|
+
raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
|
|
832
|
+
|
|
833
|
+
def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
|
|
529
834
|
"""
|
|
530
835
|
Checks whether the specified optional package is available. If not, raises an exception
|
|
531
836
|
with an error message informing the user how to install it.
|
|
@@ -542,7 +847,8 @@ class Env:
|
|
|
542
847
|
if not package_info.is_installed:
|
|
543
848
|
# Still not found.
|
|
544
849
|
raise excs.Error(
|
|
545
|
-
f'This feature requires the `{package_name}` package. To install it, run:
|
|
850
|
+
f'This feature requires the `{package_name}` package. To install it, run: '
|
|
851
|
+
f'`pip install -U {package_info.library_name}`'
|
|
546
852
|
)
|
|
547
853
|
|
|
548
854
|
if min_version is None:
|
|
@@ -555,56 +861,41 @@ class Env:
|
|
|
555
861
|
|
|
556
862
|
if min_version > package_info.version:
|
|
557
863
|
raise excs.Error(
|
|
558
|
-
f'The installed version of package `{package_name}` is
|
|
864
|
+
f'The installed version of package `{package_name}` is '
|
|
865
|
+
f'{".".join(str(v) for v in package_info.version)}, '
|
|
559
866
|
f'but version >={".".join(str(v) for v in min_version)} is required. '
|
|
560
867
|
f'To fix this, run: `pip install -U {package_info.library_name}`'
|
|
561
868
|
)
|
|
562
869
|
|
|
563
|
-
def
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
import spacy
|
|
570
|
-
from spacy.cli.download import get_model_filename
|
|
571
|
-
spacy_model = 'en_core_web_sm'
|
|
572
|
-
spacy_model_version = '3.7.1'
|
|
573
|
-
filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
|
|
574
|
-
url = f'{spacy.about.__download_url__}/{filename}'
|
|
575
|
-
# Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
|
|
576
|
-
# a problem, because the model have been installed on a previous attempt.
|
|
577
|
-
self._logger.info(f'Ensuring spaCy model is installed: {filename}')
|
|
578
|
-
ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
|
|
579
|
-
if ret.returncode != 0:
|
|
580
|
-
self._logger.warn(f'pip install failed for spaCy model: {filename}')
|
|
581
|
-
try:
|
|
582
|
-
self._logger.info(f'Loading spaCy model: {spacy_model}')
|
|
583
|
-
self._spacy_nlp = spacy.load(spacy_model)
|
|
584
|
-
except Exception as exc:
|
|
585
|
-
self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
|
|
586
|
-
warnings.warn(
|
|
587
|
-
f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
|
|
588
|
-
excs.PixeltableWarning
|
|
589
|
-
)
|
|
590
|
-
self.__optional_packages['spacy'].is_installed = False
|
|
591
|
-
|
|
592
|
-
def num_tmp_files(self) -> int:
|
|
593
|
-
return len(glob.glob(f'{self._tmp_dir}/*'))
|
|
594
|
-
|
|
595
|
-
def create_tmp_path(self, extension: str = '') -> Path:
|
|
596
|
-
return self._tmp_dir / f'{uuid.uuid4()}{extension}'
|
|
870
|
+
def clear_tmp_dir(self) -> None:
|
|
871
|
+
for path in glob.glob(f'{self._tmp_dir}/*'):
|
|
872
|
+
if os.path.isdir(path):
|
|
873
|
+
shutil.rmtree(path)
|
|
874
|
+
else:
|
|
875
|
+
os.remove(path)
|
|
597
876
|
|
|
598
|
-
|
|
599
|
-
def
|
|
600
|
-
|
|
601
|
-
|
|
877
|
+
# def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
|
|
878
|
+
def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
|
|
879
|
+
"""Returns the info object for the given id, creating it if necessary."""
|
|
880
|
+
info = self._resource_pool_info.get(pool_id)
|
|
881
|
+
if info is None and make_pool_info is not None:
|
|
882
|
+
info = make_pool_info()
|
|
883
|
+
self._resource_pool_info[pool_id] = info
|
|
884
|
+
return info
|
|
602
885
|
|
|
603
886
|
@property
|
|
604
887
|
def media_dir(self) -> Path:
|
|
605
888
|
assert self._media_dir is not None
|
|
606
889
|
return self._media_dir
|
|
607
890
|
|
|
891
|
+
@property
|
|
892
|
+
def default_input_media_dest(self) -> str | None:
|
|
893
|
+
return self._default_input_media_dest
|
|
894
|
+
|
|
895
|
+
@property
|
|
896
|
+
def default_output_media_dest(self) -> str | None:
|
|
897
|
+
return self._default_output_media_dest
|
|
898
|
+
|
|
608
899
|
@property
|
|
609
900
|
def file_cache_dir(self) -> Path:
|
|
610
901
|
assert self._file_cache_dir is not None
|
|
@@ -628,9 +919,86 @@ class Env:
|
|
|
628
919
|
@property
|
|
629
920
|
def spacy_nlp(self) -> spacy.Language:
|
|
630
921
|
Env.get().require_package('spacy')
|
|
922
|
+
if self._spacy_nlp is None:
|
|
923
|
+
self.__init_spacy()
|
|
631
924
|
assert self._spacy_nlp is not None
|
|
632
925
|
return self._spacy_nlp
|
|
633
926
|
|
|
927
|
+
def __init_spacy(self) -> None:
|
|
928
|
+
"""
|
|
929
|
+
spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
|
|
930
|
+
dependency, we install it programmatically here. This should cause no problems, since the model packages
|
|
931
|
+
have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
|
|
932
|
+
"""
|
|
933
|
+
import spacy
|
|
934
|
+
from spacy.cli.download import download
|
|
935
|
+
|
|
936
|
+
spacy_model = 'en_core_web_sm'
|
|
937
|
+
self._logger.info(f'Ensuring spaCy model is installed: {spacy_model}')
|
|
938
|
+
download(spacy_model)
|
|
939
|
+
self._logger.info(f'Loading spaCy model: {spacy_model}')
|
|
940
|
+
try:
|
|
941
|
+
self._spacy_nlp = spacy.load(spacy_model)
|
|
942
|
+
except Exception as exc:
|
|
943
|
+
raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
|
|
944
|
+
|
|
945
|
+
def _clean_up(self) -> None:
|
|
946
|
+
"""
|
|
947
|
+
Internal cleanup method that properly closes all resources and resets state.
|
|
948
|
+
This is called before destroying the singleton instance.
|
|
949
|
+
"""
|
|
950
|
+
assert self._current_session is None
|
|
951
|
+
assert self._current_conn is None
|
|
952
|
+
|
|
953
|
+
# Stop HTTP server
|
|
954
|
+
if self._httpd is not None:
|
|
955
|
+
try:
|
|
956
|
+
self._httpd.shutdown()
|
|
957
|
+
self._httpd.server_close()
|
|
958
|
+
except Exception as e:
|
|
959
|
+
_logger.warning(f'Error stopping HTTP server: {e}')
|
|
960
|
+
|
|
961
|
+
# First terminate all connections to the database
|
|
962
|
+
if self._db_server is not None:
|
|
963
|
+
assert self._dbms is not None
|
|
964
|
+
assert self._db_name is not None
|
|
965
|
+
try:
|
|
966
|
+
temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
|
|
967
|
+
try:
|
|
968
|
+
with temp_engine.begin() as conn:
|
|
969
|
+
conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
|
|
970
|
+
_logger.info(f"Terminated all connections to database '{self._db_name}'")
|
|
971
|
+
except Exception as e:
|
|
972
|
+
_logger.warning(f'Error terminating database connections: {e}')
|
|
973
|
+
finally:
|
|
974
|
+
temp_engine.dispose()
|
|
975
|
+
except Exception as e:
|
|
976
|
+
_logger.warning(f'Error stopping database server: {e}')
|
|
977
|
+
|
|
978
|
+
# Dispose of SQLAlchemy engine (after stopping db server)
|
|
979
|
+
if self._sa_engine is not None:
|
|
980
|
+
try:
|
|
981
|
+
self._sa_engine.dispose()
|
|
982
|
+
except Exception as e:
|
|
983
|
+
_logger.warning(f'Error disposing engine: {e}')
|
|
984
|
+
|
|
985
|
+
# Close event loop
|
|
986
|
+
if self._event_loop is not None:
|
|
987
|
+
try:
|
|
988
|
+
if self._event_loop.is_running():
|
|
989
|
+
self._event_loop.stop()
|
|
990
|
+
self._event_loop.close()
|
|
991
|
+
except Exception as e:
|
|
992
|
+
_logger.warning(f'Error closing event loop: {e}')
|
|
993
|
+
|
|
994
|
+
# Remove logging handlers
|
|
995
|
+
for handler in self._logger.handlers[:]:
|
|
996
|
+
try:
|
|
997
|
+
handler.close()
|
|
998
|
+
self._logger.removeHandler(handler)
|
|
999
|
+
except Exception as e:
|
|
1000
|
+
_logger.warning(f'Error removing handler: {e}')
|
|
1001
|
+
|
|
634
1002
|
|
|
635
1003
|
def register_client(name: str) -> Callable:
|
|
636
1004
|
"""Decorator that registers a third-party API client for use by Pixeltable.
|
|
@@ -656,100 +1024,183 @@ def register_client(name: str) -> Callable:
|
|
|
656
1024
|
Args:
|
|
657
1025
|
- name (str): The name of the API client (e.g., 'openai' or 'label-studio').
|
|
658
1026
|
"""
|
|
1027
|
+
|
|
659
1028
|
def decorator(fn: Callable) -> None:
|
|
660
|
-
global _registered_clients
|
|
661
1029
|
sig = inspect.signature(fn)
|
|
662
|
-
|
|
663
|
-
|
|
1030
|
+
params = dict(sig.parameters)
|
|
1031
|
+
with _registered_clients_lock:
|
|
1032
|
+
_registered_clients[name] = ApiClient(init_fn=fn, params=params)
|
|
664
1033
|
|
|
665
1034
|
return decorator
|
|
666
1035
|
|
|
667
1036
|
|
|
668
|
-
|
|
1037
|
+
_registered_clients_lock: threading.Lock = threading.Lock()
|
|
1038
|
+
_registered_clients: dict[str, ApiClient] = {}
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
@dataclass
|
|
1042
|
+
class ApiClient:
|
|
1043
|
+
init_fn: Callable
|
|
1044
|
+
params: dict[str, inspect.Parameter]
|
|
1045
|
+
client_obj: Any | None = None
|
|
1046
|
+
|
|
1047
|
+
|
|
1048
|
+
@dataclass
|
|
1049
|
+
class PackageInfo:
|
|
1050
|
+
is_installed: bool
|
|
1051
|
+
library_name: str # pypi library name (may be different from package name)
|
|
1052
|
+
version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
TIME_FORMAT = '%H:%M.%S %f'
|
|
1056
|
+
# As far as rate limiting goes, we try not go lower than 5% of the capacity because we don't have perfect information
|
|
1057
|
+
# about the rate limits and the usage
|
|
1058
|
+
TARGET_RATE_LIMIT_RESOURCE_FRACT = 0.05
|
|
1059
|
+
|
|
1060
|
+
|
|
1061
|
+
@dataclass
|
|
1062
|
+
class RateLimitsInfo:
|
|
669
1063
|
"""
|
|
670
|
-
|
|
671
|
-
|
|
1064
|
+
Abstract base class for resource pools made up of rate limits for different resources.
|
|
1065
|
+
|
|
1066
|
+
Rate limits and currently remaining resources are periodically reported via record().
|
|
1067
|
+
|
|
1068
|
+
Subclasses provide operational customization via:
|
|
1069
|
+
- get_retry_delay()
|
|
1070
|
+
- get_request_resources(self, ...) -> dict[str, int]
|
|
1071
|
+
with parameters that are a subset of those of the udf that creates the subclass's instance
|
|
672
1072
|
"""
|
|
673
|
-
__config: dict[str, Any]
|
|
674
1073
|
|
|
675
|
-
|
|
1074
|
+
# get_request_resources:
|
|
1075
|
+
# - Returns estimated resources needed for a specific request (ie, a single udf call) as a dict (key: resource name)
|
|
1076
|
+
# - parameters are a subset of those of the udf
|
|
1077
|
+
# - this is not a class method because the signature depends on the instantiating udf
|
|
1078
|
+
get_request_resources: Callable[..., dict[str, int]]
|
|
676
1079
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
"""
|
|
680
|
-
Loads configuration from the specified TOML file. If the file does not exist, it will be
|
|
681
|
-
created and populated with the default configuration.
|
|
682
|
-
"""
|
|
683
|
-
if os.path.isfile(path):
|
|
684
|
-
with open(path, 'r') as stream:
|
|
685
|
-
try:
|
|
686
|
-
config_dict = toml.load(stream)
|
|
687
|
-
except Exception as exc:
|
|
688
|
-
raise excs.Error(f'Could not read config file: {str(path)}') from exc
|
|
689
|
-
else:
|
|
690
|
-
config_dict = cls.__create_default_config(path)
|
|
691
|
-
with open(path, 'w') as stream:
|
|
692
|
-
try:
|
|
693
|
-
toml.dump(config_dict, stream)
|
|
694
|
-
except Exception as exc:
|
|
695
|
-
raise excs.Error(f'Could not write config file: {str(path)}') from exc
|
|
696
|
-
logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
|
|
697
|
-
return cls(config_dict)
|
|
1080
|
+
resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
|
|
1081
|
+
has_exc: bool = False
|
|
698
1082
|
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
702
|
-
# Default cache size is 1/5 of free disk space
|
|
703
|
-
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
704
|
-
return {
|
|
705
|
-
'pixeltable': {
|
|
706
|
-
'file_cache_size_g': round(file_cache_size_g, 1),
|
|
707
|
-
'hide_warnings': False,
|
|
708
|
-
}
|
|
709
|
-
}
|
|
710
|
-
|
|
711
|
-
def __init__(self, config: dict[str, Any]) -> None:
|
|
712
|
-
self.__config = config
|
|
713
|
-
|
|
714
|
-
def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
|
|
715
|
-
env_var = f'{section.upper()}_{key.upper()}'
|
|
716
|
-
if env_var in os.environ:
|
|
717
|
-
value = os.environ[env_var]
|
|
718
|
-
elif section in self.__config and key in self.__config[section]:
|
|
719
|
-
value = self.__config[section][key]
|
|
720
|
-
else:
|
|
721
|
-
return None
|
|
1083
|
+
def debug_str(self) -> str:
|
|
1084
|
+
return ','.join(info.debug_str() for info in self.resource_limits.values())
|
|
722
1085
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
except ValueError:
|
|
726
|
-
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
|
|
1086
|
+
def is_initialized(self) -> bool:
|
|
1087
|
+
return len(self.resource_limits) > 0
|
|
727
1088
|
|
|
728
|
-
def
|
|
729
|
-
|
|
1089
|
+
def reset(self) -> None:
|
|
1090
|
+
self.resource_limits.clear()
|
|
730
1091
|
|
|
731
|
-
def
|
|
732
|
-
|
|
1092
|
+
def record(self, request_ts: datetime.datetime, reset_exc: bool = False, **kwargs: Any) -> None:
    """Update self.resource_limits with the provided rate limit info.

    Args:
        request_ts: time at which the request was made
        reset_exc: if True, reset the has_exc flag
        **kwargs: one entry per resource name; each value is unpacked as the arguments that follow the
            timestamp in RateLimitInfo(...) / RateLimitInfo.update(...). None values are skipped.
    """
    if not self.resource_limits:
        # first report: build the initial state from scratch
        self.resource_limits = {k: RateLimitInfo(k, request_ts, *v) for k, v in kwargs.items() if v is not None}
        # TODO: remove
        for info in self.resource_limits.values():
            _logger.debug(f'Updated resource state: {info}')
        return

    if self.has_exc and not reset_exc:
        # ignore updates until we're asked to reset
        _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
        return

    self.has_exc = False
    for k, v in kwargs.items():
        if v is None:
            continue
        info = self.resource_limits.get(k)
        if info is None:
            # A resource that was absent (None) in the initial report must not raise KeyError when it
            # shows up later; start tracking it now.
            info = RateLimitInfo(k, request_ts, *v)
            self.resource_limits[k] = info
        else:
            info.update(request_ts, *v)
        _logger.debug(f'Updated resource state: {info}')
|
|
1113
|
+
|
|
1114
|
+
def record_exc(self, request_ts: datetime.datetime, exc: Exception) -> None:
    """Note that a request failed with an exception.

    This implementation only raises the has_exc flag; neither argument is inspected and
    self.resource_limits is left unchanged.

    Args:
        request_ts: time at which the request that caused the exception was made
        exc: the exception raised
    """
    self.has_exc = True
|
|
1120
|
+
|
|
1121
|
+
def get_retry_delay(self, exc: Exception, attempt: int) -> float | None:
|
|
1122
|
+
"""Returns number of seconds to wait before retry, or None if not retryable"""
|
|
1123
|
+
# Find the highest wait until at least 5% availability of all resources
|
|
1124
|
+
max_wait = 0.0
|
|
1125
|
+
for limit_info in self.resource_limits.values():
|
|
1126
|
+
time_until = limit_info.estimated_resource_refill_delay(
|
|
1127
|
+
math.ceil(TARGET_RATE_LIMIT_RESOURCE_FRACT * limit_info.limit)
|
|
1128
|
+
)
|
|
1129
|
+
if time_until is not None:
|
|
1130
|
+
max_wait = max(max_wait, time_until)
|
|
1131
|
+
return max_wait if max_wait > 0 else None
|
|
733
1132
|
|
|
734
|
-
def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
|
|
735
|
-
return self.get_value(key, float, section)
|
|
736
1133
|
|
|
737
|
-
|
|
738
|
-
|
|
1134
|
+
@dataclass
|
|
1135
|
+
class RateLimitInfo:
|
|
1136
|
+
"""Container for rate limit-related information for a single resource."""
|
|
1137
|
+
|
|
1138
|
+
resource: str
|
|
1139
|
+
request_start_ts: datetime.datetime
|
|
1140
|
+
limit: int
|
|
1141
|
+
remaining: int
|
|
1142
|
+
reset_at: datetime.datetime
|
|
1143
|
+
|
|
1144
|
+
def debug_str(self) -> str:
|
|
1145
|
+
return (
|
|
1146
|
+
f'{self.resource}@{self.request_start_ts.strftime(TIME_FORMAT)}: '
|
|
1147
|
+
f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
|
|
1148
|
+
)
|
|
739
1149
|
|
|
1150
|
+
def update(
|
|
1151
|
+
self, request_start_ts: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime
|
|
1152
|
+
) -> None:
|
|
1153
|
+
# Responses can come out of order, especially for failed requests. We need to be careful not to overwrite
|
|
1154
|
+
# the current state with less up-to-date information. We use request_start_ts as a proxy for rate limit info
|
|
1155
|
+
# recency.
|
|
1156
|
+
if self.request_start_ts > request_start_ts:
|
|
1157
|
+
# The current state is more up-to-date than the update
|
|
1158
|
+
_logger.debug(
|
|
1159
|
+
f'Ignoring out-of-date update for {self.resource}. Current request_start_ts: '
|
|
1160
|
+
f'{self.request_start_ts}, update: {request_start_ts}'
|
|
1161
|
+
)
|
|
1162
|
+
return
|
|
1163
|
+
self.request_start_ts = request_start_ts
|
|
1164
|
+
self.limit = limit
|
|
1165
|
+
self.remaining = remaining
|
|
1166
|
+
self.reset_at = reset_at
|
|
1167
|
+
|
|
1168
|
+
def estimated_resource_refill_delay(self, target_remaining: int) -> float | None:
|
|
1169
|
+
"""Estimate time in seconds until remaining resources reaches target_remaining.
|
|
1170
|
+
Assumes linear replenishment of resources over time.
|
|
1171
|
+
Returns None if unable to estimate.
|
|
1172
|
+
"""
|
|
1173
|
+
if self.remaining >= target_remaining:
|
|
1174
|
+
return 0
|
|
1175
|
+
if self.request_start_ts >= self.reset_at:
|
|
1176
|
+
return 0
|
|
1177
|
+
if self.limit < target_remaining:
|
|
1178
|
+
return None
|
|
740
1179
|
|
|
741
|
-
|
|
1180
|
+
# Estimate resource refill rate based on the recorded state and timestamps. Assumes linear refill.
|
|
1181
|
+
refill_rate = (self.limit - self.remaining) / (self.reset_at - self.request_start_ts).total_seconds()
|
|
1182
|
+
assert refill_rate > 0, f'self={self}, target_remaining={target_remaining}'
|
|
742
1183
|
|
|
1184
|
+
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
|
1185
|
+
time_until = (target_remaining - self.remaining) / refill_rate - (now - self.request_start_ts).total_seconds()
|
|
1186
|
+
return max(0, math.ceil(time_until))
|
|
743
1187
|
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
1188
|
+
def __repr__(self) -> str:
|
|
1189
|
+
return (
|
|
1190
|
+
f'RateLimitInfo(resource={self.resource}, request_start_ts={self.request_start_ts}, '
|
|
1191
|
+
f'remaining={self.remaining}/{self.limit} ({(100 * self.remaining / self.limit):.1f}%), '
|
|
1192
|
+
f'reset_at={self.reset_at})'
|
|
1193
|
+
)
|
|
749
1194
|
|
|
750
1195
|
|
|
751
1196
|
@dataclass
|
|
752
|
-
class
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
1197
|
+
class RuntimeCtx:
|
|
1198
|
+
"""
|
|
1199
|
+
Container for runtime data provided by the execution system to udfs.
|
|
1200
|
+
|
|
1201
|
+
Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
|
|
1202
|
+
"""
|
|
1203
|
+
|
|
1204
|
+
# Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
|
|
1205
|
+
# If True, call RateLimitsInfo.record() with reset_exc=True.
|
|
1206
|
+
is_retry: bool = False
|