pixeltable-0.2.26-py3-none-any.whl → pixeltable-0.5.7-py3-none-any.whl

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/env.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import datetime
4
5
  import glob
5
6
  import http.server
@@ -7,68 +8,101 @@ import importlib
7
8
  import importlib.util
8
9
  import inspect
9
10
  import logging
11
+ import math
10
12
  import os
13
+ import platform
11
14
  import shutil
12
15
  import subprocess
13
16
  import sys
14
17
  import threading
15
- import uuid
18
+ import types
19
+ import typing
16
20
  import warnings
17
- from dataclasses import dataclass
21
+ from contextlib import contextmanager
22
+ from dataclasses import dataclass, field
18
23
  from pathlib import Path
19
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
24
+ from sys import stdout
25
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
20
26
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
21
27
 
28
+ import nest_asyncio # type: ignore[import-untyped]
22
29
  import pixeltable_pgserver
23
30
  import sqlalchemy as sql
24
- import toml
31
+ import tzlocal
32
+ from pillow_heif import register_heif_opener # type: ignore[import-untyped]
33
+ from sqlalchemy import orm
34
+ from tenacity import retry, stop_after_attempt, wait_exponential_jitter
25
35
  from tqdm import TqdmWarning
26
36
 
27
- import pixeltable.exceptions as excs
28
- from pixeltable import metadata
37
+ from pixeltable import exceptions as excs
38
+ from pixeltable.config import Config
39
+ from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
40
+ from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
29
41
  from pixeltable.utils.http_server import make_server
42
+ from pixeltable.utils.object_stores import ObjectPath
43
+ from pixeltable.utils.sql import add_option_to_db_url
30
44
 
31
45
  if TYPE_CHECKING:
32
46
  import spacy
33
47
 
34
48
 
49
+ _logger = logging.getLogger('pixeltable')
50
+
51
+ T = TypeVar('T')
52
+
53
+
35
54
  class Env:
36
55
  """
37
- Store for runtime globals.
56
+ Store runtime globals for both local and non-local environments.
57
+ For a local environment, Pixeltable uses an embedded PostgreSQL server that runs locally in a separate process.
58
+ For a non-local environment, Pixeltable uses a connection string to the externally managed database.
38
59
  """
39
60
 
40
- _instance: Optional[Env] = None
61
+ SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
62
+
63
+ _instance: Env | None = None
64
+ __initializing: bool = False
41
65
  _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
42
66
 
43
- _home: Optional[Path]
44
- _media_dir: Optional[Path]
45
- _file_cache_dir: Optional[Path] # cached media files with external URL
46
- _dataset_cache_dir: Optional[Path] # cached datasets (eg, pytorch or COCO)
47
- _log_dir: Optional[Path] # log files
48
- _tmp_dir: Optional[Path] # any tmp files
49
- _sa_engine: Optional[sql.engine.base.Engine]
50
- _pgdata_dir: Optional[Path]
51
- _db_name: Optional[str]
52
- _db_server: Optional[pixeltable_pgserver.PostgresServer]
53
- _db_url: Optional[str]
54
- _default_time_zone: Optional[ZoneInfo]
67
+ _media_dir: Path | None
68
+ _file_cache_dir: Path | None # cached object files with external URL
69
+ _dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
70
+ _log_dir: Path | None # log files
71
+ _tmp_dir: Path | None # any tmp files
72
+ _sa_engine: sql.engine.base.Engine | None
73
+ _pgdata_dir: Path | None
74
+ _db_name: str | None
75
+ _db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
76
+ _db_url: str | None
77
+ _default_time_zone: ZoneInfo | None
78
+ _verbosity: int
55
79
 
56
80
  # info about optional packages that are utilized by some parts of the code
57
81
  __optional_packages: dict[str, PackageInfo]
58
82
 
59
- _spacy_nlp: Optional[spacy.Language]
60
- _httpd: Optional[http.server.HTTPServer]
61
- _http_address: Optional[str]
83
+ _spacy_nlp: spacy.Language | None
84
+ _httpd: http.server.HTTPServer | None
85
+ _http_address: str | None
62
86
  _logger: logging.Logger
63
87
  _default_log_level: int
64
- _logfilename: Optional[str]
88
+ _logfilename: str | None
65
89
  _log_to_stdout: bool
66
90
  _module_log_level: dict[str, int] # module name -> log level
67
- _config_file: Optional[Path]
68
- _config: Optional[Config]
91
+ _file_cache_size_g: float
92
+ _default_input_media_dest: str | None
93
+ _default_output_media_dest: str | None
94
+ _pxt_api_key: str | None
69
95
  _stdout_handler: logging.StreamHandler
96
+ _default_video_encoder: str | None
70
97
  _initialized: bool
71
98
 
99
+ _resource_pool_info: dict[str, Any]
100
+ _current_conn: sql.Connection | None
101
+ _current_session: orm.Session | None
102
+ _current_isolation_level: str | None
103
+ _dbms: Dbms | None
104
+ _event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
105
+
72
106
  @classmethod
73
107
  def get(cls) -> Env:
74
108
  if cls._instance is None:
@@ -77,15 +111,26 @@ class Env:
77
111
 
78
112
  @classmethod
79
113
  def _init_env(cls, reinit_db: bool = False) -> None:
114
+ assert not cls.__initializing, 'Circular env initialization detected.'
115
+ cls.__initializing = True
116
+ if cls._instance is not None:
117
+ cls._instance._clean_up()
118
+ cls._instance = None
80
119
  env = Env()
81
- env._set_up(reinit_db=reinit_db)
82
- env._upgrade_metadata()
83
- cls._instance = env
120
+ try:
121
+ env._set_up(reinit_db=reinit_db)
122
+ env._upgrade_metadata()
123
+ cls._instance = env
124
+ finally:
125
+ # Reset the initializing flag, even if setup fails.
126
+ # This prevents the environment from being left in a broken state.
127
+ cls.__initializing = False
128
+
129
+ def __init__(self) -> None:
130
+ assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
84
131
 
85
- def __init__(self):
86
- self._home = None
87
132
  self._media_dir = None # computed media files
88
- self._file_cache_dir = None # cached media files with external URL
133
+ self._file_cache_dir = None # cached object files with external URL
89
134
  self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
90
135
  self._log_dir = None # log files
91
136
  self._tmp_dir = None # any tmp files
@@ -95,11 +140,11 @@ class Env:
95
140
  self._db_server = None
96
141
  self._db_url = None
97
142
  self._default_time_zone = None
98
-
99
143
  self.__optional_packages = {}
100
144
  self._spacy_nlp = None
101
145
  self._httpd = None
102
146
  self._http_address = None
147
+ self._default_video_encoder = None
103
148
 
104
149
  # logging-related state
105
150
  self._logger = logging.getLogger('pixeltable')
@@ -111,19 +156,42 @@ class Env:
111
156
  self._log_to_stdout = False
112
157
  self._module_log_level = {} # module name -> log level
113
158
 
114
- # config
115
- self._config_file = None
116
- self._config = None
117
-
118
159
  # create logging handler to also log to stdout
119
160
  self._stdout_handler = logging.StreamHandler(stream=sys.stdout)
120
161
  self._stdout_handler.setFormatter(logging.Formatter(self._log_fmt_str))
121
162
  self._initialized = False
122
163
 
164
+ self._resource_pool_info = {}
165
+ self._current_conn = None
166
+ self._current_session = None
167
+ self._current_isolation_level = None
168
+ self._dbms = None
169
+ self._event_loop = None
170
+
171
+ def _init_event_loop(self) -> None:
172
+ try:
173
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
174
+ # multiple run_until_complete()
175
+ running_loop = asyncio.get_running_loop()
176
+ self._event_loop = running_loop
177
+ _logger.debug('Patched running loop')
178
+ except RuntimeError:
179
+ self._event_loop = asyncio.new_event_loop()
180
+ asyncio.set_event_loop(self._event_loop)
181
+ # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
182
+ self._event_loop.slow_callback_duration = 3600
183
+
184
+ # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
185
+ # see run_coroutine_synchronously()
186
+ nest_asyncio.apply()
187
+ if _logger.isEnabledFor(logging.DEBUG):
188
+ self._event_loop.set_debug(True)
189
+
123
190
  @property
124
- def config(self) -> Config:
125
- assert self._config is not None
126
- return self._config
191
+ def event_loop(self) -> asyncio.AbstractEventLoop:
192
+ if self._event_loop is None:
193
+ self._init_event_loop()
194
+ return self._event_loop
127
195
 
128
196
  @property
129
197
  def db_url(self) -> str:
@@ -136,25 +204,105 @@ class Env:
136
204
  return self._http_address
137
205
 
138
206
  @property
139
- def default_time_zone(self) -> Optional[ZoneInfo]:
207
+ def user(self) -> str | None:
208
+ return Config.get().get_string_value('user')
209
+
210
+ @user.setter
211
+ def user(self, user: str | None) -> None:
212
+ if user is None:
213
+ if 'PIXELTABLE_USER' in os.environ:
214
+ del os.environ['PIXELTABLE_USER']
215
+ else:
216
+ os.environ['PIXELTABLE_USER'] = user
217
+
218
+ @property
219
+ def default_time_zone(self) -> ZoneInfo | None:
140
220
  return self._default_time_zone
141
221
 
142
222
  @default_time_zone.setter
143
- def default_time_zone(self, tz: Optional[ZoneInfo]) -> None:
223
+ def default_time_zone(self, tz: ZoneInfo | None) -> None:
144
224
  """
145
225
  This is not a publicly visible setter; it is only for testing purposes.
146
226
  """
147
- tz_name = None if tz is None else tz.key
227
+ if tz is None:
228
+ tz_name = self._get_tz_name()
229
+ else:
230
+ assert isinstance(tz, ZoneInfo)
231
+ tz_name = tz.key
148
232
  self.engine.dispose()
149
233
  self._create_engine(time_zone_name=tz_name)
150
234
 
235
+ @property
236
+ def verbosity(self) -> int:
237
+ return self._verbosity
238
+
239
+ @property
240
+ def conn(self) -> sql.Connection | None:
241
+ assert self._current_conn is not None
242
+ return self._current_conn
243
+
244
+ @property
245
+ def session(self) -> orm.Session | None:
246
+ assert self._current_session is not None
247
+ return self._current_session
248
+
249
+ @property
250
+ def dbms(self) -> Dbms | None:
251
+ assert self._dbms is not None
252
+ return self._dbms
253
+
254
+ @property
255
+ def is_using_cockroachdb(self) -> bool:
256
+ assert self._dbms is not None
257
+ return isinstance(self._dbms, CockroachDbms)
258
+
259
+ @property
260
+ def in_xact(self) -> bool:
261
+ return self._current_conn is not None
262
+
263
+ @property
264
+ def is_local(self) -> bool:
265
+ assert self._db_url is not None # is_local should be called only after db initialization
266
+ return self._db_server is not None
267
+
268
+ @contextmanager
269
+ def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
270
+ """
271
+ Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
272
+
273
+ for_write: if True, uses serializable isolation; if False, uses repeatable_read
274
+
275
+ TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
276
+ that avoids tripping over any pending ops
277
+ """
278
+ if self._current_conn is None:
279
+ assert self._current_session is None
280
+ try:
281
+ self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
282
+ with (
283
+ self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
284
+ orm.Session(conn) as session,
285
+ conn.begin(),
286
+ ):
287
+ self._current_conn = conn
288
+ self._current_session = session
289
+ yield conn
290
+ finally:
291
+ self._current_session = None
292
+ self._current_conn = None
293
+ self._current_isolation_level = None
294
+ else:
295
+ assert self._current_session is not None
296
+ assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
297
+ yield self._current_conn
298
+
151
299
  def configure_logging(
152
300
  self,
153
301
  *,
154
- to_stdout: Optional[bool] = None,
155
- level: Optional[int] = None,
156
- add: Optional[str] = None,
157
- remove: Optional[str] = None,
302
+ to_stdout: bool | None = None,
303
+ level: int | None = None,
304
+ add: str | None = None,
305
+ remove: str | None = None,
158
306
  ) -> None:
159
307
  """Configure logging.
160
308
 
@@ -196,7 +344,7 @@ class Env:
196
344
  def set_log_level(self, level: int) -> None:
197
345
  self._default_log_level = level
198
346
 
199
- def set_module_log_level(self, module: str, level: Optional[int]) -> None:
347
+ def set_module_log_level(self, module: str, level: int | None) -> None:
200
348
  if level is None:
201
349
  self._module_log_level.pop(module, None)
202
350
  else:
@@ -211,14 +359,37 @@ class Env:
211
359
  # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
212
360
  path_parts = list(Path(record.pathname).parts)
213
361
  path_parts.reverse()
362
+ if 'pixeltable' not in path_parts:
363
+ return False
214
364
  max_idx = path_parts.index('pixeltable')
215
365
  for module_name in path_parts[:max_idx]:
216
366
  if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
217
367
  return True
218
- if record.levelno >= self._default_log_level:
219
- return True
368
+ return record.levelno >= self._default_log_level
369
+
370
+ @property
371
+ def console_logger(self) -> ConsoleLogger:
372
+ return self._console_logger
373
+
374
+ def _get_tz_name(self) -> str:
375
+ """Get the time zone name from the configuration, or the system local time zone if not specified.
376
+
377
+ Returns:
378
+ str: The time zone name.
379
+ """
380
+ tz_name = Config.get().get_string_value('time_zone')
381
+ if tz_name is not None:
382
+ # Validate tzname
383
+ if not isinstance(tz_name, str):
384
+ self._logger.error('Invalid time zone specified in configuration.')
385
+ else:
386
+ try:
387
+ _ = ZoneInfo(tz_name)
388
+ except ZoneInfoNotFoundError:
389
+ self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
220
390
  else:
221
- return False
391
+ tz_name = tzlocal.get_localzone_name()
392
+ return tz_name
222
393
 
223
394
  def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
224
395
  if self._initialized:
@@ -226,55 +397,57 @@ class Env:
226
397
 
227
398
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'
228
399
 
400
+ config = Config.get()
401
+
229
402
  self._initialized = True
230
- home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
231
- assert self._home is None or self._home == home
232
- self._home = home
233
- self._config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self._home / 'config.toml')))
234
- self._media_dir = self._home / 'media'
235
- self._file_cache_dir = self._home / 'file_cache'
236
- self._dataset_cache_dir = self._home / 'dataset_cache'
237
- self._log_dir = self._home / 'logs'
238
- self._tmp_dir = self._home / 'tmp'
239
-
240
- if self._home.exists() and not self._home.is_dir():
241
- raise RuntimeError(f'{self._home} is not a directory')
242
-
243
- if not self._home.exists():
244
- # we don't have our logger set up yet, so print to stdout
245
- print(f'Creating a Pixeltable instance at: {self._home}')
246
- self._home.mkdir()
247
- # TODO (aaron-siegel) This is the existing behavior, but it seems scary. If something happens to
248
- # self._home, it will cause the DB to be destroyed even if pgdata is in an alternate location.
249
- # PROPOSAL: require `reinit_db` to be set explicitly to destroy the DB.
250
- reinit_db = True
251
-
252
- if not self._media_dir.exists():
253
- self._media_dir.mkdir()
254
- if not self._file_cache_dir.exists():
255
- self._file_cache_dir.mkdir()
256
- if not self._dataset_cache_dir.exists():
257
- self._dataset_cache_dir.mkdir()
258
- if not self._log_dir.exists():
259
- self._log_dir.mkdir()
260
- if not self._tmp_dir.exists():
261
- self._tmp_dir.mkdir()
262
-
263
- # Read in the config
264
- self._config = Config.from_file(self._config_file)
265
- self._file_cache_size_g = self._config.get_float_value('file_cache_size_g')
403
+
404
+ self._media_dir = Config.get().home / 'media'
405
+ self._file_cache_dir = Config.get().home / 'file_cache'
406
+ self._dataset_cache_dir = Config.get().home / 'dataset_cache'
407
+ self._log_dir = Config.get().home / 'logs'
408
+ self._tmp_dir = Config.get().home / 'tmp'
409
+
410
+ self._media_dir.mkdir(exist_ok=True)
411
+ self._file_cache_dir.mkdir(exist_ok=True)
412
+ self._dataset_cache_dir.mkdir(exist_ok=True)
413
+ self._log_dir.mkdir(exist_ok=True)
414
+ self._tmp_dir.mkdir(exist_ok=True)
415
+
416
+ self._file_cache_size_g = config.get_float_value('file_cache_size_g')
266
417
  if self._file_cache_size_g is None:
267
418
  raise excs.Error(
268
419
  'pixeltable/file_cache_size_g is missing from configuration\n'
269
- f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {self._config_file},\n'
420
+ f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
270
421
  'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
271
422
  )
272
423
 
424
+ self._default_input_media_dest = config.get_string_value('input_media_dest')
425
+ self._default_output_media_dest = config.get_string_value('output_media_dest')
426
+ for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
427
+ if uri is not None:
428
+ try:
429
+ _ = ObjectPath.parse_object_storage_addr(uri, False)
430
+ except Exception as e:
431
+ raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
432
+
433
+ self._pxt_api_key = config.get_string_value('api_key')
434
+
273
435
  # Disable spurious warnings
274
436
  warnings.simplefilter('ignore', category=TqdmWarning)
275
- if self._config.get_bool_value('hide_warnings'):
437
+ if config.get_bool_value('hide_warnings'):
276
438
  # Disable more warnings
277
439
  warnings.simplefilter('ignore', category=UserWarning)
440
+ warnings.simplefilter('ignore', category=FutureWarning)
441
+
442
+ # Set verbosity level for user visible console messages
443
+ self._verbosity = config.get_int_value('verbosity')
444
+ if self._verbosity is None:
445
+ self._verbosity = 1
446
+ stdout_handler = ConsoleOutputHandler(stream=stdout)
447
+ stdout_handler.setLevel(map_level(self._verbosity))
448
+ stdout_handler.addFilter(ConsoleMessageFilter())
449
+ self._logger.addHandler(stdout_handler)
450
+ self._console_logger = ConsoleLogger(self._logger)
278
451
 
279
452
  # configure _logger to log to a file
280
453
  self._logfilename = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
@@ -304,33 +477,21 @@ class Env:
304
477
  http_logger.addHandler(http_fh)
305
478
  http_logger.propagate = False
306
479
 
307
- # empty tmp dir
308
- for path in glob.glob(f'{self._tmp_dir}/*'):
309
- os.remove(path)
310
-
311
- self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
312
- self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
480
+ self.clear_tmp_dir()
481
+ tz_name = self._get_tz_name()
313
482
 
314
- # in pixeltable_pgserver.get_server(): cleanup_mode=None will leave db on for debugging purposes
315
- self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
316
- self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
483
+ # configure pixeltable database
484
+ self._init_db(config)
317
485
 
318
- tz_name = self.config.get_string_value('time_zone')
319
- if tz_name is not None:
320
- # Validate tzname
321
- if not isinstance(tz_name, str):
322
- self._logger.error(f'Invalid time zone specified in configuration.')
323
- else:
324
- try:
325
- _ = ZoneInfo(tz_name)
326
- except ZoneInfoNotFoundError:
327
- self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
486
+ if reinit_db and not self.is_local:
487
+ raise excs.Error(
488
+ 'Reinitializing pixeltable database is not supported when running in non-local environment'
489
+ )
328
490
 
329
491
  if reinit_db and self._store_db_exists():
330
492
  self._drop_store_db()
331
493
 
332
494
  create_db = not self._store_db_exists()
333
-
334
495
  if create_db:
335
496
  self._logger.info(f'creating database at: {self.db_url}')
336
497
  self._create_store_db()
@@ -340,38 +501,104 @@ class Env:
340
501
  # Create the SQLAlchemy engine. This will also set the default time zone.
341
502
  self._create_engine(time_zone_name=tz_name, echo=echo)
342
503
 
343
- if create_db:
344
- from pixeltable.metadata import schema
345
- schema.base_metadata.create_all(self._sa_engine)
346
- metadata.create_system_info(self._sa_engine)
504
+ # Create catalog tables and system metadata
505
+ self._init_metadata()
347
506
 
348
- print(f'Connected to Pixeltable database at: {self.db_url}')
507
+ self.console_logger.info(f'Connected to Pixeltable database at: {self.db_url}')
349
508
 
350
509
  # we now have a home directory and db; start other services
351
510
  self._set_up_runtime()
352
511
  self.log_to_stdout(False)
353
512
 
354
- def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
355
- connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
513
+ def _init_db(self, config: Config) -> None:
514
+ """
515
+ Initialize the pixeltable database along with its associated DBMS.
516
+ """
517
+ db_connect_str = config.get_string_value('DB_CONNECT_STR')
518
+ if db_connect_str is not None:
519
+ try:
520
+ db_url = sql.make_url(db_connect_str)
521
+ except sql.exc.ArgumentError as e:
522
+ error = f'Invalid db connection string {db_connect_str}: {e}'
523
+ self._logger.error(error)
524
+ raise excs.Error(error) from e
525
+ self._db_url = db_url.render_as_string(hide_password=False)
526
+ self._db_name = db_url.database # use the dbname given in connect string
527
+ dialect = db_url.get_dialect().name
528
+ if dialect == 'cockroachdb':
529
+ self._dbms = CockroachDbms(db_url)
530
+ else:
531
+ raise excs.Error(f'Unsupported DBMS {dialect}')
532
+ # Check if database exists
533
+ if not self._store_db_exists():
534
+ error = f'Database {self._db_name!r} does not exist'
535
+ self._logger.error(error)
536
+ raise excs.Error(error)
537
+ self._logger.info(f'Using database at: {self.db_url}')
538
+ else:
539
+ self._db_name = config.get_string_value('db') or 'pixeltable'
540
+ self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
541
+ # cleanup_mode=None will leave the postgres process running after Python exits
542
+ # cleanup_mode='stop' will terminate the postgres process when Python exits
543
+ # On Windows, we need cleanup_mode='stop' because child processes are killed automatically when the parent
544
+ # process (such as Terminal or VSCode) exits, potentially leaving it in an unusable state.
545
+ cleanup_mode = 'stop' if platform.system() == 'Windows' else None
546
+ self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=cleanup_mode)
547
+ self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
548
+ self._dbms = PostgresqlDbms(sql.make_url(self._db_url))
549
+ assert self._dbms is not None
550
+ assert self._db_url is not None
551
+ assert self._db_name is not None
552
+
553
+ @retry(
554
+ stop=stop_after_attempt(3), # Stop after 3 attempts
555
+ wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
556
+ )
557
+ def _init_metadata(self) -> None:
558
+ """
559
+ Create pixeltable metadata tables and system metadata.
560
+ This is an idempotent operation.
561
+
562
+ Retry logic handles race conditions when multiple Pixeltable processes
563
+ attempt to initialize metadata tables simultaneously. The first process may succeed
564
+ in creating tables while others encounter database constraints (e.g., "table already exists").
565
+ Exponential backoff with jitter reduces contention between competing processes.
566
+ """
567
+ assert self._sa_engine is not None
568
+ from pixeltable import metadata
569
+
570
+ self._logger.debug('Creating pixeltable metadata')
571
+ metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
572
+ metadata.create_system_info(self._sa_engine)
573
+
574
+ def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
575
+ # Add timezone option to connection string
576
+ updated_url = add_option_to_db_url(self.db_url, f'-c timezone={time_zone_name}')
577
+
356
578
  self._sa_engine = sql.create_engine(
357
- self.db_url,
358
- echo=echo,
359
- future=True,
360
- isolation_level='AUTOCOMMIT',
361
- connect_args=connect_args,
579
+ updated_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level
362
580
  )
581
+
363
582
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
583
+ self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
584
+ self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
585
+
364
586
  with self.engine.begin() as conn:
365
587
  tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
366
588
  assert isinstance(tz_name, str)
367
589
  self._logger.info(f'Database time zone is now: {tz_name}')
368
590
  self._default_time_zone = ZoneInfo(tz_name)
591
+ if self.is_using_cockroachdb:
592
+ # This could be set when the database is created, but we set it now
593
+ conn.execute(sql.text('SET null_ordered_last = true;'))
594
+ null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
595
+ assert isinstance(null_ordered_last, str)
596
+ self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
369
597
 
370
598
  def _store_db_exists(self) -> bool:
371
599
  assert self._db_name is not None
372
600
  # don't try to connect to self.db_name, it may not exist
373
- db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
374
- engine = sql.create_engine(db_url, future=True)
601
+ engine = sql.create_engine(self._dbms.default_system_db_url(), future=True)
375
602
  try:
376
603
  with engine.begin() as conn:
377
604
  stmt = f"SELECT COUNT(*) FROM pg_database WHERE datname = '{self._db_name}'"
@@ -384,53 +611,55 @@ class Env:
384
611
  def _create_store_db(self) -> None:
385
612
  assert self._db_name is not None
386
613
  # create the db
387
- pg_db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
388
- engine = sql.create_engine(pg_db_url, future=True, isolation_level='AUTOCOMMIT')
614
+ engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
389
615
  preparer = engine.dialect.identifier_preparer
390
616
  try:
391
617
  with engine.begin() as conn:
392
- # use C collation to get standard C/Python-style sorting
393
- stmt = (
394
- f"CREATE DATABASE {preparer.quote(self._db_name)} "
395
- "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
396
- )
618
+ stmt = self._dbms.create_db_stmt(preparer.quote(self._db_name))
397
619
  conn.execute(sql.text(stmt))
398
620
  finally:
399
621
  engine.dispose()
400
622
 
401
623
  # enable pgvector
402
- store_db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
403
- engine = sql.create_engine(store_db_url, future=True, isolation_level='AUTOCOMMIT')
624
+ engine = sql.create_engine(self.db_url, future=True, isolation_level='AUTOCOMMIT')
404
625
  try:
405
626
  with engine.begin() as conn:
406
627
  conn.execute(sql.text('CREATE EXTENSION vector'))
407
628
  finally:
408
629
  engine.dispose()
409
630
 
631
+ def _pgserver_terminate_connections_stmt(self) -> str:
632
+ return f"""
633
+ SELECT pg_terminate_backend(pg_stat_activity.pid)
634
+ FROM pg_stat_activity
635
+ WHERE pg_stat_activity.datname = '{self._db_name}'
636
+ AND pid <> pg_backend_pid()
637
+ """
638
+
410
639
  def _drop_store_db(self) -> None:
411
640
  assert self._db_name is not None
412
- db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
413
- engine = sql.create_engine(db_url, future=True, isolation_level='AUTOCOMMIT')
641
+ engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
414
642
  preparer = engine.dialect.identifier_preparer
415
643
  try:
416
644
  with engine.begin() as conn:
417
645
  # terminate active connections
418
- stmt = (f"""
419
- SELECT pg_terminate_backend(pg_stat_activity.pid)
420
- FROM pg_stat_activity
421
- WHERE pg_stat_activity.datname = '{self._db_name}'
422
- AND pid <> pg_backend_pid()
423
- """)
424
- conn.execute(sql.text(stmt))
646
+ if self._db_server is not None:
647
+ conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
425
648
  # drop db
426
- stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
649
+ stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
427
650
  conn.execute(sql.text(stmt))
428
651
  finally:
429
652
  engine.dispose()
430
653
 
431
654
  def _upgrade_metadata(self) -> None:
655
+ from pixeltable import metadata
656
+
432
657
  metadata.upgrade_md(self._sa_engine)
433
658
 
659
+ @property
660
+ def pxt_api_key(self) -> str | None:
661
+ return self._pxt_api_key
662
+
434
663
  def get_client(self, name: str) -> Any:
435
664
  """
436
665
  Gets the client with the specified name, initializing it if necessary.
@@ -438,35 +667,51 @@ class Env:
438
667
  Args:
439
668
  - name: The name of the client
440
669
  """
441
- cl = _registered_clients[name]
442
- if cl.client_obj is not None:
443
- return cl.client_obj # Already initialized
444
-
445
- # Construct a client, retrieving each parameter from config.
446
-
447
- init_kwargs: dict[str, str] = {}
448
- for param in cl.param_names:
449
- arg = self._config.get_string_value(param, section=name)
450
- if arg is not None and len(arg) > 0:
451
- init_kwargs[param] = arg
452
- else:
670
+ # Return the existing client if it has already been constructed
671
+ with _registered_clients_lock:
672
+ cl = _registered_clients[name]
673
+ if cl.client_obj is not None:
674
+ return cl.client_obj # Already initialized
675
+
676
+ # Retrieve parameters required to construct the requested client.
677
+ init_kwargs: dict[str, Any] = {}
678
+ for param in cl.params.values():
679
+ # Determine the type of the parameter for proper config parsing.
680
+ pname = param.name
681
+ t = param.annotation
682
+ # Deference T | None
683
+ if typing.get_origin(t) in (typing.Union, types.UnionType):
684
+ args = typing.get_args(t)
685
+ if args[0] is type(None):
686
+ t = args[1]
687
+ elif args[1] is type(None):
688
+ t = args[0]
689
+ assert isinstance(t, type), t
690
+ arg: Any = Config.get().get_value(pname, t, section=name)
691
+ if arg is not None:
692
+ init_kwargs[pname] = arg
693
+ elif param.default is inspect.Parameter.empty:
453
694
  raise excs.Error(
454
- f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
455
- f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, or put `{param.lower()}` in '
456
- f'the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
695
+ f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
696
+ f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
697
+ f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
457
698
  )
458
699
 
459
- cl.client_obj = cl.init_fn(**init_kwargs)
460
- self._logger.info(f'Initialized `{name}` client.')
461
- return cl.client_obj
700
+ # Construct the requested client
701
+ with _registered_clients_lock:
702
+ if cl.client_obj is not None:
703
+ return cl.client_obj # Already initialized
704
+ cl.client_obj = cl.init_fn(**init_kwargs)
705
+ self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
706
+ return cl.client_obj
462
707
 
463
708
  def _start_web_server(self) -> None:
464
709
  """
465
710
  The http server root is the file system root.
466
711
  eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
467
- in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
468
- This arrangement enables serving media hosted within _home,
469
- as well as external media inserted into pixeltable or produced by pixeltable.
712
+ On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
713
+ This arrangement enables serving objects hosted within _home,
714
+ as well as external objects inserted into pixeltable or produced by pixeltable.
470
715
  The port is chosen dynamically to prevent conflicts.
471
716
  """
472
717
  # Port 0 means OS picks one for us.
@@ -474,7 +719,7 @@ class Env:
474
719
  port = self._httpd.server_address[1]
475
720
  self._http_address = f'http://127.0.0.1:{port}'
476
721
 
477
- def run_server():
722
+ def run_server() -> None:
478
723
  logging.log(logging.INFO, f'running web server at {self._http_address}')
479
724
  self._httpd.serve_forever()
480
725
 
@@ -484,30 +729,77 @@ class Env:
484
729
 
485
730
  def _set_up_runtime(self) -> None:
486
731
  """Check for and start runtime services"""
732
+ register_heif_opener()
487
733
  self._start_web_server()
488
734
  self.__register_packages()
489
- if self.is_installed_package('spacy'):
490
- self.__init_spacy()
735
+
736
+ @property
737
+ def default_video_encoder(self) -> str | None:
738
+ if self._default_video_encoder is None:
739
+ self._default_video_encoder = self._determine_default_video_encoder()
740
+ return self._default_video_encoder
741
+
742
+ def _determine_default_video_encoder(self) -> str | None:
743
+ """
744
+ Returns the first available encoder from a list of candidates.
745
+
746
+ TODO:
747
+ - the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
748
+ - allow user override via a config option 'video_encoder'
749
+ """
750
+ # look for available encoders, in this order
751
+ candidates = [
752
+ 'libx264', # GPL, best quality
753
+ 'libopenh264', # BSD
754
+ ]
755
+
756
+ try:
757
+ # Get list of available encoders
758
+ result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
759
+
760
+ if result.returncode == 0:
761
+ available_encoders = result.stdout
762
+ for encoder in candidates:
763
+ # ffmpeg -encoders output format: " V..... encoder_name description"
764
+ if f' {encoder} ' in available_encoders:
765
+ _logger.debug(f'Using H.264 encoder: {encoder}')
766
+ return encoder
767
+ except Exception:
768
+ pass
769
+ return None
491
770
 
492
771
  def __register_packages(self) -> None:
493
772
  """Declare optional packages that are utilized by some parts of the code."""
773
+ self.__register_package('accelerate')
494
774
  self.__register_package('anthropic')
775
+ self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
495
776
  self.__register_package('boto3')
496
777
  self.__register_package('datasets')
778
+ self.__register_package('diffusers')
497
779
  self.__register_package('fiftyone')
780
+ self.__register_package('twelvelabs')
781
+ self.__register_package('fal_client', library_name='fal-client')
498
782
  self.__register_package('fireworks', library_name='fireworks-ai')
783
+ self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
784
+ self.__register_package('google.genai', library_name='google-genai')
785
+ self.__register_package('groq')
499
786
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
500
787
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
788
+ self.__register_package('librosa')
501
789
  self.__register_package('llama_cpp', library_name='llama-cpp-python')
790
+ self.__register_package('mcp')
502
791
  self.__register_package('mistralai')
503
792
  self.__register_package('mistune')
504
793
  self.__register_package('ollama')
505
794
  self.__register_package('openai')
506
795
  self.__register_package('openpyxl')
507
796
  self.__register_package('pyarrow')
797
+ self.__register_package('pydantic')
508
798
  self.__register_package('replicate')
799
+ self.__register_package('reve')
509
800
  self.__register_package('sentencepiece')
510
801
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
802
+ self.__register_package('soundfile')
511
803
  self.__register_package('spacy')
512
804
  self.__register_package('tiktoken')
513
805
  self.__register_package('together')
@@ -515,17 +807,30 @@ class Env:
515
807
  self.__register_package('torchaudio')
516
808
  self.__register_package('torchvision')
517
809
  self.__register_package('transformers')
810
+ self.__register_package('voyageai')
518
811
  self.__register_package('whisper', library_name='openai-whisper')
519
812
  self.__register_package('whisperx')
520
- self.__register_package('yolox', library_name='git+https://github.com/Megvii-BaseDetection/YOLOX@ac58e0a')
813
+ self.__register_package('yolox', library_name='pixeltable-yolox')
814
+ self.__register_package('lancedb')
815
+ self.__register_package('scenedetect')
521
816
 
522
- def __register_package(self, package_name: str, library_name: Optional[str] = None) -> None:
817
+ def __register_package(self, package_name: str, library_name: str | None = None) -> None:
818
+ is_installed: bool
819
+ try:
820
+ is_installed = importlib.util.find_spec(package_name) is not None
821
+ except ModuleNotFoundError:
822
+ # This can happen if the parent of `package_name` is not installed.
823
+ is_installed = False
523
824
  self.__optional_packages[package_name] = PackageInfo(
524
- is_installed=importlib.util.find_spec(package_name) is not None,
525
- library_name=library_name or package_name # defaults to package_name unless specified otherwise
825
+ is_installed=is_installed,
826
+ library_name=library_name or package_name, # defaults to package_name unless specified otherwise
526
827
  )
527
828
 
528
- def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
829
+ def require_binary(self, binary_name: str) -> None:
830
+ if not shutil.which(binary_name):
831
+ raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
832
+
833
+ def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
529
834
  """
530
835
  Checks whether the specified optional package is available. If not, raises an exception
531
836
  with an error message informing the user how to install it.
@@ -542,7 +847,8 @@ class Env:
542
847
  if not package_info.is_installed:
543
848
  # Still not found.
544
849
  raise excs.Error(
545
- f'This feature requires the `{package_name}` package. To install it, run: `pip install -U {package_info.library_name}`'
850
+ f'This feature requires the `{package_name}` package. To install it, run: '
851
+ f'`pip install -U {package_info.library_name}`'
546
852
  )
547
853
 
548
854
  if min_version is None:
@@ -555,56 +861,41 @@ class Env:
555
861
 
556
862
  if min_version > package_info.version:
557
863
  raise excs.Error(
558
- f'The installed version of package `{package_name}` is {".".join(str(v) for v in package_info.version)}, '
864
+ f'The installed version of package `{package_name}` is '
865
+ f'{".".join(str(v) for v in package_info.version)}, '
559
866
  f'but version >={".".join(str(v) for v in min_version)} is required. '
560
867
  f'To fix this, run: `pip install -U {package_info.library_name}`'
561
868
  )
562
869
 
563
- def __init_spacy(self) -> None:
564
- """
565
- spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
566
- dependency, we install it programmatically here. This should cause no problems, since the model packages
567
- have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
568
- """
569
- import spacy
570
- from spacy.cli.download import get_model_filename
571
- spacy_model = 'en_core_web_sm'
572
- spacy_model_version = '3.7.1'
573
- filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
574
- url = f'{spacy.about.__download_url__}/{filename}'
575
- # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
576
- # a problem, because the model have been installed on a previous attempt.
577
- self._logger.info(f'Ensuring spaCy model is installed: {filename}')
578
- ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
579
- if ret.returncode != 0:
580
- self._logger.warn(f'pip install failed for spaCy model: {filename}')
581
- try:
582
- self._logger.info(f'Loading spaCy model: {spacy_model}')
583
- self._spacy_nlp = spacy.load(spacy_model)
584
- except Exception as exc:
585
- self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
586
- warnings.warn(
587
- f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
588
- excs.PixeltableWarning
589
- )
590
- self.__optional_packages['spacy'].is_installed = False
591
-
592
- def num_tmp_files(self) -> int:
593
- return len(glob.glob(f'{self._tmp_dir}/*'))
594
-
595
- def create_tmp_path(self, extension: str = '') -> Path:
596
- return self._tmp_dir / f'{uuid.uuid4()}{extension}'
870
+ def clear_tmp_dir(self) -> None:
871
+ for path in glob.glob(f'{self._tmp_dir}/*'):
872
+ if os.path.isdir(path):
873
+ shutil.rmtree(path)
874
+ else:
875
+ os.remove(path)
597
876
 
598
- @property
599
- def home(self) -> Path:
600
- assert self._home is not None
601
- return self._home
877
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
878
+ def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
879
+ """Returns the info object for the given id, creating it if necessary."""
880
+ info = self._resource_pool_info.get(pool_id)
881
+ if info is None and make_pool_info is not None:
882
+ info = make_pool_info()
883
+ self._resource_pool_info[pool_id] = info
884
+ return info
602
885
 
603
886
  @property
604
887
  def media_dir(self) -> Path:
605
888
  assert self._media_dir is not None
606
889
  return self._media_dir
607
890
 
891
+ @property
892
+ def default_input_media_dest(self) -> str | None:
893
+ return self._default_input_media_dest
894
+
895
+ @property
896
+ def default_output_media_dest(self) -> str | None:
897
+ return self._default_output_media_dest
898
+
608
899
  @property
609
900
  def file_cache_dir(self) -> Path:
610
901
  assert self._file_cache_dir is not None
@@ -628,9 +919,86 @@ class Env:
628
919
  @property
629
920
  def spacy_nlp(self) -> spacy.Language:
630
921
  Env.get().require_package('spacy')
922
+ if self._spacy_nlp is None:
923
+ self.__init_spacy()
631
924
  assert self._spacy_nlp is not None
632
925
  return self._spacy_nlp
633
926
 
927
+ def __init_spacy(self) -> None:
928
+ """
929
+ spaCy relies on a pip-installed model to operate. In order to avoid requiring the model as a separate
930
+ dependency, we install it programmatically here. This should cause no problems, since the model packages
931
+ have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
932
+ """
933
+ import spacy
934
+ from spacy.cli.download import download
935
+
936
+ spacy_model = 'en_core_web_sm'
937
+ self._logger.info(f'Ensuring spaCy model is installed: {spacy_model}')
938
+ download(spacy_model)
939
+ self._logger.info(f'Loading spaCy model: {spacy_model}')
940
+ try:
941
+ self._spacy_nlp = spacy.load(spacy_model)
942
+ except Exception as exc:
943
+ raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
944
+
945
+ def _clean_up(self) -> None:
946
+ """
947
+ Internal cleanup method that properly closes all resources and resets state.
948
+ This is called before destroying the singleton instance.
949
+ """
950
+ assert self._current_session is None
951
+ assert self._current_conn is None
952
+
953
+ # Stop HTTP server
954
+ if self._httpd is not None:
955
+ try:
956
+ self._httpd.shutdown()
957
+ self._httpd.server_close()
958
+ except Exception as e:
959
+ _logger.warning(f'Error stopping HTTP server: {e}')
960
+
961
+ # First terminate all connections to the database
962
+ if self._db_server is not None:
963
+ assert self._dbms is not None
964
+ assert self._db_name is not None
965
+ try:
966
+ temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
967
+ try:
968
+ with temp_engine.begin() as conn:
969
+ conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
970
+ _logger.info(f"Terminated all connections to database '{self._db_name}'")
971
+ except Exception as e:
972
+ _logger.warning(f'Error terminating database connections: {e}')
973
+ finally:
974
+ temp_engine.dispose()
975
+ except Exception as e:
976
+ _logger.warning(f'Error stopping database server: {e}')
977
+
978
+ # Dispose of SQLAlchemy engine (after stopping db server)
979
+ if self._sa_engine is not None:
980
+ try:
981
+ self._sa_engine.dispose()
982
+ except Exception as e:
983
+ _logger.warning(f'Error disposing engine: {e}')
984
+
985
+ # Close event loop
986
+ if self._event_loop is not None:
987
+ try:
988
+ if self._event_loop.is_running():
989
+ self._event_loop.stop()
990
+ self._event_loop.close()
991
+ except Exception as e:
992
+ _logger.warning(f'Error closing event loop: {e}')
993
+
994
+ # Remove logging handlers
995
+ for handler in self._logger.handlers[:]:
996
+ try:
997
+ handler.close()
998
+ self._logger.removeHandler(handler)
999
+ except Exception as e:
1000
+ _logger.warning(f'Error removing handler: {e}')
1001
+
634
1002
 
635
1003
  def register_client(name: str) -> Callable:
636
1004
  """Decorator that registers a third-party API client for use by Pixeltable.
@@ -656,100 +1024,183 @@ def register_client(name: str) -> Callable:
656
1024
  Args:
657
1025
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
658
1026
  """
1027
+
659
1028
  def decorator(fn: Callable) -> None:
660
- global _registered_clients
661
1029
  sig = inspect.signature(fn)
662
- param_names = list(sig.parameters.keys())
663
- _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
1030
+ params = dict(sig.parameters)
1031
+ with _registered_clients_lock:
1032
+ _registered_clients[name] = ApiClient(init_fn=fn, params=params)
664
1033
 
665
1034
  return decorator
666
1035
 
667
1036
 
668
- class Config:
1037
+ _registered_clients_lock: threading.Lock = threading.Lock()
1038
+ _registered_clients: dict[str, ApiClient] = {}
1039
+
1040
+
1041
+ @dataclass
1042
+ class ApiClient:
1043
+ init_fn: Callable
1044
+ params: dict[str, inspect.Parameter]
1045
+ client_obj: Any | None = None
1046
+
1047
+
1048
+ @dataclass
1049
+ class PackageInfo:
1050
+ is_installed: bool
1051
+ library_name: str # pypi library name (may be different from package name)
1052
+ version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
1053
+
1054
+
1055
+ TIME_FORMAT = '%H:%M.%S %f'
1056
+ # As far as rate limiting goes, we try not go lower than 5% of the capacity because we don't have perfect information
1057
+ # about the rate limits and the usage
1058
+ TARGET_RATE_LIMIT_RESOURCE_FRACT = 0.05
1059
+
1060
+
1061
+ @dataclass
1062
+ class RateLimitsInfo:
669
1063
  """
670
- The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
671
- configuration values, which can be set in the config file or as environment variables.
1064
+ Abstract base class for resource pools made up of rate limits for different resources.
1065
+
1066
+ Rate limits and currently remaining resources are periodically reported via record().
1067
+
1068
+ Subclasses provide operational customization via:
1069
+ - get_retry_delay()
1070
+ - get_request_resources(self, ...) -> dict[str, int]
1071
+ with parameters that are a subset of those of the udf that creates the subclass's instance
672
1072
  """
673
- __config: dict[str, Any]
674
1073
 
675
- T = TypeVar('T')
1074
+ # get_request_resources:
1075
+ # - Returns estimated resources needed for a specific request (ie, a single udf call) as a dict (key: resource name)
1076
+ # - parameters are a subset of those of the udf
1077
+ # - this is not a class method because the signature depends on the instantiating udf
1078
+ get_request_resources: Callable[..., dict[str, int]]
676
1079
 
677
- @classmethod
678
- def from_file(cls, path: Path) -> Config:
679
- """
680
- Loads configuration from the specified TOML file. If the file does not exist, it will be
681
- created and populated with the default configuration.
682
- """
683
- if os.path.isfile(path):
684
- with open(path, 'r') as stream:
685
- try:
686
- config_dict = toml.load(stream)
687
- except Exception as exc:
688
- raise excs.Error(f'Could not read config file: {str(path)}') from exc
689
- else:
690
- config_dict = cls.__create_default_config(path)
691
- with open(path, 'w') as stream:
692
- try:
693
- toml.dump(config_dict, stream)
694
- except Exception as exc:
695
- raise excs.Error(f'Could not write config file: {str(path)}') from exc
696
- logging.getLogger('pixeltable').info(f'Created default config file at: {str(path)}')
697
- return cls(config_dict)
1080
+ resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
1081
+ has_exc: bool = False
698
1082
 
699
- @classmethod
700
- def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
701
- free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
702
- # Default cache size is 1/5 of free disk space
703
- file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
704
- return {
705
- 'pixeltable': {
706
- 'file_cache_size_g': round(file_cache_size_g, 1),
707
- 'hide_warnings': False,
708
- }
709
- }
710
-
711
- def __init__(self, config: dict[str, Any]) -> None:
712
- self.__config = config
713
-
714
- def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
715
- env_var = f'{section.upper()}_{key.upper()}'
716
- if env_var in os.environ:
717
- value = os.environ[env_var]
718
- elif section in self.__config and key in self.__config[section]:
719
- value = self.__config[section][key]
720
- else:
721
- return None
1083
+ def debug_str(self) -> str:
1084
+ return ','.join(info.debug_str() for info in self.resource_limits.values())
722
1085
 
723
- try:
724
- return expected_type(value) # type: ignore[call-arg]
725
- except ValueError:
726
- raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
1086
+ def is_initialized(self) -> bool:
1087
+ return len(self.resource_limits) > 0
727
1088
 
728
- def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
729
- return self.get_value(key, str, section)
1089
+ def reset(self) -> None:
1090
+ self.resource_limits.clear()
730
1091
 
731
- def get_int_value(self, key: str, section: str = 'pixeltable') -> Optional[int]:
732
- return self.get_value(key, int, section)
1092
+ def record(self, request_ts: datetime.datetime, reset_exc: bool = False, **kwargs: Any) -> None:
1093
+ """Update self.resource_limits with the provided rate limit info.
1094
+ Args:
1095
+ - request_ts: time at which the request was made
1096
+ - reset_exc: if True, reset the has_exc flag
1097
+ """
1098
+ if len(self.resource_limits) == 0:
1099
+ self.resource_limits = {k: RateLimitInfo(k, request_ts, *v) for k, v in kwargs.items() if v is not None}
1100
+ # TODO: remove
1101
+ for info in self.resource_limits.values():
1102
+ _logger.debug(f'Updated resource state: {info}')
1103
+ else:
1104
+ if self.has_exc and not reset_exc:
1105
+ # ignore updates until we're asked to reset
1106
+ _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
1107
+ return
1108
+ self.has_exc = False
1109
+ for k, v in kwargs.items():
1110
+ if v is not None:
1111
+ self.resource_limits[k].update(request_ts, *v)
1112
+ _logger.debug(f'Updated resource state: {self.resource_limits[k]}')
1113
+
1114
def record_exc(self, request_ts: datetime.datetime, exc: Exception) -> None:
    """Note that a request failed with an exception.

    As written, this only raises the has_exc flag; neither argument is inspected here.
    NOTE(review): presumably overrides derive rate limit state from the exception's headers -- confirm.

    Args:
        request_ts: time at which the request that caused the exception was made
        exc: the exception raised
    """
    self.has_exc = True
1120
+
1121
+ def get_retry_delay(self, exc: Exception, attempt: int) -> float | None:
1122
+ """Returns number of seconds to wait before retry, or None if not retryable"""
1123
+ # Find the highest wait until at least 5% availability of all resources
1124
+ max_wait = 0.0
1125
+ for limit_info in self.resource_limits.values():
1126
+ time_until = limit_info.estimated_resource_refill_delay(
1127
+ math.ceil(TARGET_RATE_LIMIT_RESOURCE_FRACT * limit_info.limit)
1128
+ )
1129
+ if time_until is not None:
1130
+ max_wait = max(max_wait, time_until)
1131
+ return max_wait if max_wait > 0 else None
733
1132
 
734
- def get_float_value(self, key: str, section: str = 'pixeltable') -> Optional[float]:
735
- return self.get_value(key, float, section)
736
1133
 
737
- def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
738
- return self.get_value(key, bool, section)
1134
+ @dataclass
1135
+ class RateLimitInfo:
1136
+ """Container for rate limit-related information for a single resource."""
1137
+
1138
+ resource: str
1139
+ request_start_ts: datetime.datetime
1140
+ limit: int
1141
+ remaining: int
1142
+ reset_at: datetime.datetime
1143
+
1144
+ def debug_str(self) -> str:
1145
+ return (
1146
+ f'{self.resource}@{self.request_start_ts.strftime(TIME_FORMAT)}: '
1147
+ f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
1148
+ )
739
1149
 
1150
+ def update(
1151
+ self, request_start_ts: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime
1152
+ ) -> None:
1153
+ # Responses can come out of order, especially for failed requests. We need to be careful not to overwrite
1154
+ # the current state with less up-to-date information. We use request_start_ts as a proxy for rate limit info
1155
+ # recency.
1156
+ if self.request_start_ts > request_start_ts:
1157
+ # The current state is more up-to-date than the update
1158
+ _logger.debug(
1159
+ f'Ignoring out-of-date update for {self.resource}. Current request_start_ts: '
1160
+ f'{self.request_start_ts}, update: {request_start_ts}'
1161
+ )
1162
+ return
1163
+ self.request_start_ts = request_start_ts
1164
+ self.limit = limit
1165
+ self.remaining = remaining
1166
+ self.reset_at = reset_at
1167
+
1168
+ def estimated_resource_refill_delay(self, target_remaining: int) -> float | None:
1169
+ """Estimate time in seconds until remaining resources reaches target_remaining.
1170
+ Assumes linear replenishment of resources over time.
1171
+ Returns None if unable to estimate.
1172
+ """
1173
+ if self.remaining >= target_remaining:
1174
+ return 0
1175
+ if self.request_start_ts >= self.reset_at:
1176
+ return 0
1177
+ if self.limit < target_remaining:
1178
+ return None
740
1179
 
741
- _registered_clients: dict[str, ApiClient] = {}
1180
+ # Estimate resource refill rate based on the recorded state and timestamps. Assumes linear refill.
1181
+ refill_rate = (self.limit - self.remaining) / (self.reset_at - self.request_start_ts).total_seconds()
1182
+ assert refill_rate > 0, f'self={self}, target_remaining={target_remaining}'
742
1183
 
1184
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
1185
+ time_until = (target_remaining - self.remaining) / refill_rate - (now - self.request_start_ts).total_seconds()
1186
+ return max(0, math.ceil(time_until))
743
1187
 
744
- @dataclass
745
- class ApiClient:
746
- init_fn: Callable
747
- param_names: list[str]
748
- client_obj: Optional[Any] = None
1188
+ def __repr__(self) -> str:
1189
+ return (
1190
+ f'RateLimitInfo(resource={self.resource}, request_start_ts={self.request_start_ts}, '
1191
+ f'remaining={self.remaining}/{self.limit} ({(100 * self.remaining / self.limit):.1f}%), '
1192
+ f'reset_at={self.reset_at})'
1193
+ )
749
1194
 
750
1195
 
751
1196
  @dataclass
752
- class PackageInfo:
753
- is_installed: bool
754
- library_name: str # pypi library name (may be different from package name)
755
- version: Optional[list[int]] = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
1197
+ class RuntimeCtx:
1198
+ """
1199
+ Container for runtime data provided by the execution system to udfs.
1200
+
1201
+ Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
1202
+ """
1203
+
1204
+ # Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
1205
+ # If True, call RateLimitsInfo.record() with reset_exc=True.
1206
+ is_retry: bool = False