pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/env.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import datetime
4
5
  import glob
5
6
  import http.server
@@ -7,24 +8,30 @@ import importlib
7
8
  import importlib.util
8
9
  import inspect
9
10
  import logging
11
+ import math
10
12
  import os
11
13
  import platform
12
14
  import shutil
13
15
  import subprocess
14
16
  import sys
15
17
  import threading
16
- import uuid
18
+ import types
19
+ import typing
17
20
  import warnings
18
- from abc import abstractmethod
19
21
  from contextlib import contextmanager
20
22
  from dataclasses import dataclass, field
21
23
  from pathlib import Path
22
24
  from sys import stdout
23
- from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
25
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, TypeVar
24
26
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
25
27
 
28
+ import nest_asyncio # type: ignore[import-untyped]
26
29
  import pixeltable_pgserver
27
30
  import sqlalchemy as sql
31
+ import tzlocal
32
+ from pillow_heif import register_heif_opener # type: ignore[import-untyped]
33
+ from sqlalchemy import orm
34
+ from tenacity import retry, stop_after_attempt, wait_exponential_jitter
28
35
  from tqdm import TqdmWarning
29
36
 
30
37
  from pixeltable import exceptions as excs
@@ -32,6 +39,8 @@ from pixeltable.config import Config
32
39
  from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
33
40
  from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
34
41
  from pixeltable.utils.http_server import make_server
42
+ from pixeltable.utils.object_stores import ObjectPath
43
+ from pixeltable.utils.sql import add_option_to_db_url
35
44
 
36
45
  if TYPE_CHECKING:
37
46
  import spacy
@@ -49,42 +58,50 @@ class Env:
49
58
  For a non-local environment, Pixeltable uses a connection string to the externally managed database.
50
59
  """
51
60
 
52
- _instance: Optional[Env] = None
61
+ SERIALIZABLE_ISOLATION_LEVEL = 'SERIALIZABLE'
62
+
63
+ _instance: Env | None = None
53
64
  __initializing: bool = False
54
65
  _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
55
66
 
56
- _media_dir: Optional[Path]
57
- _file_cache_dir: Optional[Path] # cached media files with external URL
58
- _dataset_cache_dir: Optional[Path] # cached datasets (eg, pytorch or COCO)
59
- _log_dir: Optional[Path] # log files
60
- _tmp_dir: Optional[Path] # any tmp files
61
- _sa_engine: Optional[sql.engine.base.Engine]
62
- _pgdata_dir: Optional[Path]
63
- _db_name: Optional[str]
64
- _db_server: Optional[pixeltable_pgserver.PostgresServer] # set only when running in local environment
65
- _db_url: Optional[str]
66
- _default_time_zone: Optional[ZoneInfo]
67
+ _media_dir: Path | None
68
+ _file_cache_dir: Path | None # cached object files with external URL
69
+ _dataset_cache_dir: Path | None # cached datasets (eg, pytorch or COCO)
70
+ _log_dir: Path | None # log files
71
+ _tmp_dir: Path | None # any tmp files
72
+ _sa_engine: sql.engine.base.Engine | None
73
+ _pgdata_dir: Path | None
74
+ _db_name: str | None
75
+ _db_server: pixeltable_pgserver.PostgresServer | None # set only when running in local environment
76
+ _db_url: str | None
77
+ _default_time_zone: ZoneInfo | None
78
+ _verbosity: int
67
79
 
68
80
  # info about optional packages that are utilized by some parts of the code
69
81
  __optional_packages: dict[str, PackageInfo]
70
82
 
71
- _spacy_nlp: Optional[spacy.Language]
72
- _httpd: Optional[http.server.HTTPServer]
73
- _http_address: Optional[str]
83
+ _spacy_nlp: spacy.Language | None
84
+ _httpd: http.server.HTTPServer | None
85
+ _http_address: str | None
74
86
  _logger: logging.Logger
75
87
  _default_log_level: int
76
- _logfilename: Optional[str]
88
+ _logfilename: str | None
77
89
  _log_to_stdout: bool
78
90
  _module_log_level: dict[str, int] # module name -> log level
79
91
  _file_cache_size_g: float
80
- _pxt_api_key: Optional[str]
92
+ _default_input_media_dest: str | None
93
+ _default_output_media_dest: str | None
94
+ _pxt_api_key: str | None
81
95
  _stdout_handler: logging.StreamHandler
96
+ _default_video_encoder: str | None
82
97
  _initialized: bool
83
98
 
84
99
  _resource_pool_info: dict[str, Any]
85
- _current_conn: Optional[sql.Connection]
86
- _current_session: Optional[sql.orm.Session]
87
- _dbms: Optional[Dbms]
100
+ _current_conn: sql.Connection | None
101
+ _current_session: orm.Session | None
102
+ _current_isolation_level: str | None
103
+ _dbms: Dbms | None
104
+ _event_loop: asyncio.AbstractEventLoop | None # event loop for ExecNode
88
105
 
89
106
  @classmethod
90
107
  def get(cls) -> Env:
@@ -96,17 +113,24 @@ class Env:
96
113
  def _init_env(cls, reinit_db: bool = False) -> None:
97
114
  assert not cls.__initializing, 'Circular env initialization detected.'
98
115
  cls.__initializing = True
116
+ if cls._instance is not None:
117
+ cls._instance._clean_up()
118
+ cls._instance = None
99
119
  env = Env()
100
- env._set_up(reinit_db=reinit_db)
101
- env._upgrade_metadata()
102
- cls._instance = env
103
- cls.__initializing = False
120
+ try:
121
+ env._set_up(reinit_db=reinit_db)
122
+ env._upgrade_metadata()
123
+ cls._instance = env
124
+ finally:
125
+ # Reset the initializing flag, even if setup fails.
126
+ # This prevents the environment from being left in a broken state.
127
+ cls.__initializing = False
104
128
 
105
129
  def __init__(self) -> None:
106
130
  assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
107
131
 
108
132
  self._media_dir = None # computed media files
109
- self._file_cache_dir = None # cached media files with external URL
133
+ self._file_cache_dir = None # cached object files with external URL
110
134
  self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
111
135
  self._log_dir = None # log files
112
136
  self._tmp_dir = None # any tmp files
@@ -120,6 +144,7 @@ class Env:
120
144
  self._spacy_nlp = None
121
145
  self._httpd = None
122
146
  self._http_address = None
147
+ self._default_video_encoder = None
123
148
 
124
149
  # logging-related state
125
150
  self._logger = logging.getLogger('pixeltable')
@@ -139,7 +164,34 @@ class Env:
139
164
  self._resource_pool_info = {}
140
165
  self._current_conn = None
141
166
  self._current_session = None
167
+ self._current_isolation_level = None
142
168
  self._dbms = None
169
+ self._event_loop = None
170
+
171
+ def _init_event_loop(self) -> None:
172
+ try:
173
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
174
+ # multiple run_until_complete()
175
+ running_loop = asyncio.get_running_loop()
176
+ self._event_loop = running_loop
177
+ _logger.debug('Patched running loop')
178
+ except RuntimeError:
179
+ self._event_loop = asyncio.new_event_loop()
180
+ asyncio.set_event_loop(self._event_loop)
181
+ # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
182
+ self._event_loop.slow_callback_duration = 3600
183
+
184
+ # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
185
+ # see run_coroutine_synchronously()
186
+ nest_asyncio.apply()
187
+ if _logger.isEnabledFor(logging.DEBUG):
188
+ self._event_loop.set_debug(True)
189
+
190
+ @property
191
+ def event_loop(self) -> asyncio.AbstractEventLoop:
192
+ if self._event_loop is None:
193
+ self._init_event_loop()
194
+ return self._event_loop
143
195
 
144
196
  @property
145
197
  def db_url(self) -> str:
@@ -152,11 +204,11 @@ class Env:
152
204
  return self._http_address
153
205
 
154
206
  @property
155
- def user(self) -> Optional[str]:
207
+ def user(self) -> str | None:
156
208
  return Config.get().get_string_value('user')
157
209
 
158
210
  @user.setter
159
- def user(self, user: Optional[str]) -> None:
211
+ def user(self, user: str | None) -> None:
160
212
  if user is None:
161
213
  if 'PIXELTABLE_USER' in os.environ:
162
214
  del os.environ['PIXELTABLE_USER']
@@ -164,33 +216,47 @@ class Env:
164
216
  os.environ['PIXELTABLE_USER'] = user
165
217
 
166
218
  @property
167
- def default_time_zone(self) -> Optional[ZoneInfo]:
219
+ def default_time_zone(self) -> ZoneInfo | None:
168
220
  return self._default_time_zone
169
221
 
170
222
  @default_time_zone.setter
171
- def default_time_zone(self, tz: Optional[ZoneInfo]) -> None:
223
+ def default_time_zone(self, tz: ZoneInfo | None) -> None:
172
224
  """
173
225
  This is not a publicly visible setter; it is only for testing purposes.
174
226
  """
175
- tz_name = None if tz is None else tz.key
227
+ if tz is None:
228
+ tz_name = self._get_tz_name()
229
+ else:
230
+ assert isinstance(tz, ZoneInfo)
231
+ tz_name = tz.key
176
232
  self.engine.dispose()
177
233
  self._create_engine(time_zone_name=tz_name)
178
234
 
179
235
  @property
180
- def conn(self) -> Optional[sql.Connection]:
236
+ def verbosity(self) -> int:
237
+ return self._verbosity
238
+
239
+ @property
240
+ def conn(self) -> sql.Connection | None:
181
241
  assert self._current_conn is not None
182
242
  return self._current_conn
183
243
 
184
244
  @property
185
- def session(self) -> Optional[sql.orm.Session]:
245
+ def session(self) -> orm.Session | None:
186
246
  assert self._current_session is not None
187
247
  return self._current_session
188
248
 
189
249
  @property
190
- def dbms(self) -> Optional[Dbms]:
250
+ def dbms(self) -> Dbms | None:
191
251
  assert self._dbms is not None
192
252
  return self._dbms
193
253
 
254
+ @property
255
+ def is_using_cockroachdb(self) -> bool:
256
+ assert self._dbms is not None
257
+ return isinstance(self._dbms, CockroachDbms)
258
+
259
+ @property
194
260
  def in_xact(self) -> bool:
195
261
  return self._current_conn is not None
196
262
 
@@ -200,32 +266,43 @@ class Env:
200
266
  return self._db_server is not None
201
267
 
202
268
  @contextmanager
203
- def begin_xact(self) -> Iterator[sql.Connection]:
204
- """Return a context manager that yields a connection to the database. Idempotent."""
269
+ def begin_xact(self, *, for_write: bool = False) -> Iterator[sql.Connection]:
270
+ """
271
+ Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
272
+
273
+ for_write: if True, uses serializable isolation; if False, uses repeatable_read
274
+
275
+ TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
276
+ that avoids tripping over any pending ops
277
+ """
205
278
  if self._current_conn is None:
206
279
  assert self._current_session is None
207
280
  try:
208
- with self.engine.begin() as conn, sql.orm.Session(conn) as session:
209
- # TODO: remove print() once we're done with debugging the concurrent update behavior
210
- # print(f'{datetime.datetime.now()}: start xact')
281
+ self._current_isolation_level = self.SERIALIZABLE_ISOLATION_LEVEL
282
+ with (
283
+ self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
284
+ orm.Session(conn) as session,
285
+ conn.begin(),
286
+ ):
211
287
  self._current_conn = conn
212
288
  self._current_session = session
213
289
  yield conn
214
290
  finally:
215
291
  self._current_session = None
216
292
  self._current_conn = None
217
- # print(f'{datetime.datetime.now()}: end xact')
293
+ self._current_isolation_level = None
218
294
  else:
219
295
  assert self._current_session is not None
296
+ assert self._current_isolation_level == self.SERIALIZABLE_ISOLATION_LEVEL or not for_write
220
297
  yield self._current_conn
221
298
 
222
299
  def configure_logging(
223
300
  self,
224
301
  *,
225
- to_stdout: Optional[bool] = None,
226
- level: Optional[int] = None,
227
- add: Optional[str] = None,
228
- remove: Optional[str] = None,
302
+ to_stdout: bool | None = None,
303
+ level: int | None = None,
304
+ add: str | None = None,
305
+ remove: str | None = None,
229
306
  ) -> None:
230
307
  """Configure logging.
231
308
 
@@ -267,7 +344,7 @@ class Env:
267
344
  def set_log_level(self, level: int) -> None:
268
345
  self._default_log_level = level
269
346
 
270
- def set_module_log_level(self, module: str, level: Optional[int]) -> None:
347
+ def set_module_log_level(self, module: str, level: int | None) -> None:
271
348
  if level is None:
272
349
  self._module_log_level.pop(module, None)
273
350
  else:
@@ -282,6 +359,8 @@ class Env:
282
359
  # accept log messages from a configured pixeltable module (at any level of the module hierarchy)
283
360
  path_parts = list(Path(record.pathname).parts)
284
361
  path_parts.reverse()
362
+ if 'pixeltable' not in path_parts:
363
+ return False
285
364
  max_idx = path_parts.index('pixeltable')
286
365
  for module_name in path_parts[:max_idx]:
287
366
  if module_name in self._module_log_level and record.levelno >= self._module_log_level[module_name]:
@@ -292,6 +371,26 @@ class Env:
292
371
  def console_logger(self) -> ConsoleLogger:
293
372
  return self._console_logger
294
373
 
374
+ def _get_tz_name(self) -> str:
375
+ """Get the time zone name from the configuration, or the system local time zone if not specified.
376
+
377
+ Returns:
378
+ str: The time zone name.
379
+ """
380
+ tz_name = Config.get().get_string_value('time_zone')
381
+ if tz_name is not None:
382
+ # Validate tzname
383
+ if not isinstance(tz_name, str):
384
+ self._logger.error('Invalid time zone specified in configuration.')
385
+ else:
386
+ try:
387
+ _ = ZoneInfo(tz_name)
388
+ except ZoneInfoNotFoundError:
389
+ self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
390
+ else:
391
+ tz_name = tzlocal.get_localzone_name()
392
+ return tz_name
393
+
295
394
  def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
296
395
  if self._initialized:
297
396
  return
@@ -301,22 +400,18 @@ class Env:
301
400
  config = Config.get()
302
401
 
303
402
  self._initialized = True
403
+
304
404
  self._media_dir = Config.get().home / 'media'
305
405
  self._file_cache_dir = Config.get().home / 'file_cache'
306
406
  self._dataset_cache_dir = Config.get().home / 'dataset_cache'
307
407
  self._log_dir = Config.get().home / 'logs'
308
408
  self._tmp_dir = Config.get().home / 'tmp'
309
409
 
310
- if not self._media_dir.exists():
311
- self._media_dir.mkdir()
312
- if not self._file_cache_dir.exists():
313
- self._file_cache_dir.mkdir()
314
- if not self._dataset_cache_dir.exists():
315
- self._dataset_cache_dir.mkdir()
316
- if not self._log_dir.exists():
317
- self._log_dir.mkdir()
318
- if not self._tmp_dir.exists():
319
- self._tmp_dir.mkdir()
410
+ self._media_dir.mkdir(exist_ok=True)
411
+ self._file_cache_dir.mkdir(exist_ok=True)
412
+ self._dataset_cache_dir.mkdir(exist_ok=True)
413
+ self._log_dir.mkdir(exist_ok=True)
414
+ self._tmp_dir.mkdir(exist_ok=True)
320
415
 
321
416
  self._file_cache_size_g = config.get_float_value('file_cache_size_g')
322
417
  if self._file_cache_size_g is None:
@@ -325,6 +420,16 @@ class Env:
325
420
  f'(either add a `file_cache_size_g` entry to the `pixeltable` section of {Config.get().config_file},\n'
326
421
  'or set the PIXELTABLE_FILE_CACHE_SIZE_G environment variable)'
327
422
  )
423
+
424
+ self._default_input_media_dest = config.get_string_value('input_media_dest')
425
+ self._default_output_media_dest = config.get_string_value('output_media_dest')
426
+ for mode, uri in (('input', self._default_input_media_dest), ('output', self._default_output_media_dest)):
427
+ if uri is not None:
428
+ try:
429
+ _ = ObjectPath.parse_object_storage_addr(uri, False)
430
+ except Exception as e:
431
+ raise excs.Error(f'Invalid {mode} media destination URI: {uri}') from e
432
+
328
433
  self._pxt_api_key = config.get_string_value('api_key')
329
434
 
330
435
  # Disable spurious warnings
@@ -334,10 +439,12 @@ class Env:
334
439
  warnings.simplefilter('ignore', category=UserWarning)
335
440
  warnings.simplefilter('ignore', category=FutureWarning)
336
441
 
337
- # Set verbose level for user visible console messages
338
- verbosity = map_level(config.get_int_value('verbosity'))
442
+ # Set verbosity level for user visible console messages
443
+ self._verbosity = config.get_int_value('verbosity')
444
+ if self._verbosity is None:
445
+ self._verbosity = 1
339
446
  stdout_handler = ConsoleOutputHandler(stream=stdout)
340
- stdout_handler.setLevel(verbosity)
447
+ stdout_handler.setLevel(map_level(self._verbosity))
341
448
  stdout_handler.addFilter(ConsoleMessageFilter())
342
449
  self._logger.addHandler(stdout_handler)
343
450
  self._console_logger = ConsoleLogger(self._logger)
@@ -371,6 +478,7 @@ class Env:
371
478
  http_logger.propagate = False
372
479
 
373
480
  self.clear_tmp_dir()
481
+ tz_name = self._get_tz_name()
374
482
 
375
483
  # configure pixeltable database
376
484
  self._init_db(config)
@@ -380,22 +488,10 @@ class Env:
380
488
  'Reinitializing pixeltable database is not supported when running in non-local environment'
381
489
  )
382
490
 
383
- tz_name = config.get_string_value('time_zone')
384
- if tz_name is not None:
385
- # Validate tzname
386
- if not isinstance(tz_name, str):
387
- self._logger.error('Invalid time zone specified in configuration.')
388
- else:
389
- try:
390
- _ = ZoneInfo(tz_name)
391
- except ZoneInfoNotFoundError:
392
- self._logger.error(f'Invalid time zone specified in configuration: {tz_name}')
393
-
394
491
  if reinit_db and self._store_db_exists():
395
492
  self._drop_store_db()
396
493
 
397
494
  create_db = not self._store_db_exists()
398
-
399
495
  if create_db:
400
496
  self._logger.info(f'creating database at: {self.db_url}')
401
497
  self._create_store_db()
@@ -440,7 +536,7 @@ class Env:
440
536
  raise excs.Error(error)
441
537
  self._logger.info(f'Using database at: {self.db_url}')
442
538
  else:
443
- self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
539
+ self._db_name = config.get_string_value('db') or 'pixeltable'
444
540
  self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
445
541
  # cleanup_mode=None will leave the postgres process running after Python exits
446
542
  # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -454,30 +550,50 @@ class Env:
454
550
  assert self._db_url is not None
455
551
  assert self._db_name is not None
456
552
 
553
+ @retry(
554
+ stop=stop_after_attempt(3), # Stop after 3 attempts
555
+ wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
556
+ )
457
557
  def _init_metadata(self) -> None:
458
558
  """
459
559
  Create pixeltable metadata tables and system metadata.
460
560
  This is an idempotent operation.
561
+
562
+ Retry logic handles race conditions when multiple Pixeltable processes
563
+ attempt to initialize metadata tables simultaneously. The first process may succeed
564
+ in creating tables while others encounter database constraints (e.g., "table already exists").
565
+ Exponential backoff with jitter reduces contention between competing processes.
461
566
  """
462
567
  assert self._sa_engine is not None
463
568
  from pixeltable import metadata
464
569
 
570
+ self._logger.debug('Creating pixeltable metadata')
465
571
  metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
466
572
  metadata.create_system_info(self._sa_engine)
467
573
 
468
- def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
469
- connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
574
+ def _create_engine(self, time_zone_name: str, echo: bool = False) -> None:
575
+ # Add timezone option to connection string
576
+ updated_url = add_option_to_db_url(self.db_url, f'-c timezone={time_zone_name}')
577
+
470
578
  self._sa_engine = sql.create_engine(
471
- self.db_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level, connect_args=connect_args
579
+ updated_url, echo=echo, isolation_level=self._dbms.transaction_isolation_level
472
580
  )
473
581
 
474
582
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
583
+ self._logger.info(f'Engine dialect: {self._sa_engine.dialect.name}')
584
+ self._logger.info(f'Engine driver : {self._sa_engine.dialect.driver}')
475
585
 
476
586
  with self.engine.begin() as conn:
477
587
  tz_name = conn.execute(sql.text('SHOW TIME ZONE')).scalar()
478
588
  assert isinstance(tz_name, str)
479
589
  self._logger.info(f'Database time zone is now: {tz_name}')
480
590
  self._default_time_zone = ZoneInfo(tz_name)
591
+ if self.is_using_cockroachdb:
592
+ # This could be set when the database is created, but we set it now
593
+ conn.execute(sql.text('SET null_ordered_last = true;'))
594
+ null_ordered_last = conn.execute(sql.text('SHOW null_ordered_last')).scalar()
595
+ assert isinstance(null_ordered_last, str)
596
+ self._logger.info(f'Database null_ordered_last is now: {null_ordered_last}')
481
597
 
482
598
  def _store_db_exists(self) -> bool:
483
599
  assert self._db_name is not None
@@ -512,6 +628,14 @@ class Env:
512
628
  finally:
513
629
  engine.dispose()
514
630
 
631
+ def _pgserver_terminate_connections_stmt(self) -> str:
632
+ return f"""
633
+ SELECT pg_terminate_backend(pg_stat_activity.pid)
634
+ FROM pg_stat_activity
635
+ WHERE pg_stat_activity.datname = '{self._db_name}'
636
+ AND pid <> pg_backend_pid()
637
+ """
638
+
515
639
  def _drop_store_db(self) -> None:
516
640
  assert self._db_name is not None
517
641
  engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
@@ -520,13 +644,7 @@ class Env:
520
644
  with engine.begin() as conn:
521
645
  # terminate active connections
522
646
  if self._db_server is not None:
523
- stmt = f"""
524
- SELECT pg_terminate_backend(pg_stat_activity.pid)
525
- FROM pg_stat_activity
526
- WHERE pg_stat_activity.datname = '{self._db_name}'
527
- AND pid <> pg_backend_pid()
528
- """
529
- conn.execute(sql.text(stmt))
647
+ conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
530
648
  # drop db
531
649
  stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
532
650
  conn.execute(sql.text(stmt))
@@ -539,12 +657,7 @@ class Env:
539
657
  metadata.upgrade_md(self._sa_engine)
540
658
 
541
659
  @property
542
- def pxt_api_key(self) -> str:
543
- if self._pxt_api_key is None:
544
- raise excs.Error(
545
- 'No API key is configured. Set the PIXELTABLE_API_KEY environment variable, or add an entry to '
546
- 'config.toml as described here:\nhttps://pixeltable.github.io/pixeltable/config/'
547
- )
660
+ def pxt_api_key(self) -> str | None:
548
661
  return self._pxt_api_key
549
662
 
550
663
  def get_client(self, name: str) -> Any:
@@ -554,35 +667,51 @@ class Env:
554
667
  Args:
555
668
  - name: The name of the client
556
669
  """
557
- cl = _registered_clients[name]
558
- if cl.client_obj is not None:
559
- return cl.client_obj # Already initialized
560
-
561
- # Construct a client, retrieving each parameter from config.
562
-
563
- init_kwargs: dict[str, str] = {}
564
- for param in cl.param_names:
565
- arg = Config.get().get_string_value(param, section=name)
566
- if arg is not None and len(arg) > 0:
567
- init_kwargs[param] = arg
568
- else:
670
+ # Return the existing client if it has already been constructed
671
+ with _registered_clients_lock:
672
+ cl = _registered_clients[name]
673
+ if cl.client_obj is not None:
674
+ return cl.client_obj # Already initialized
675
+
676
+ # Retrieve parameters required to construct the requested client.
677
+ init_kwargs: dict[str, Any] = {}
678
+ for param in cl.params.values():
679
+ # Determine the type of the parameter for proper config parsing.
680
+ pname = param.name
681
+ t = param.annotation
682
+ # Deference T | None
683
+ if typing.get_origin(t) in (typing.Union, types.UnionType):
684
+ args = typing.get_args(t)
685
+ if args[0] is type(None):
686
+ t = args[1]
687
+ elif args[1] is type(None):
688
+ t = args[0]
689
+ assert isinstance(t, type), t
690
+ arg: Any = Config.get().get_value(pname, t, section=name)
691
+ if arg is not None:
692
+ init_kwargs[pname] = arg
693
+ elif param.default is inspect.Parameter.empty:
569
694
  raise excs.Error(
570
- f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
571
- f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
572
- f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
695
+ f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
696
+ f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
697
+ f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
573
698
  )
574
699
 
575
- cl.client_obj = cl.init_fn(**init_kwargs)
576
- self._logger.info(f'Initialized `{name}` client.')
577
- return cl.client_obj
700
+ # Construct the requested client
701
+ with _registered_clients_lock:
702
+ if cl.client_obj is not None:
703
+ return cl.client_obj # Already initialized
704
+ cl.client_obj = cl.init_fn(**init_kwargs)
705
+ self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
706
+ return cl.client_obj
578
707
 
579
708
  def _start_web_server(self) -> None:
580
709
  """
581
710
  The http server root is the file system root.
582
711
  eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
583
- in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
584
- This arrangement enables serving media hosted within _home,
585
- as well as external media inserted into pixeltable or produced by pixeltable.
712
+ On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
713
+ This arrangement enables serving objects hosted within _home,
714
+ as well as external objects inserted into pixeltable or produced by pixeltable.
586
715
  The port is chosen dynamically to prevent conflicts.
587
716
  """
588
717
  # Port 0 means OS picks one for us.
@@ -600,20 +729,65 @@ class Env:
600
729
 
601
730
  def _set_up_runtime(self) -> None:
602
731
  """Check for and start runtime services"""
732
+ register_heif_opener()
603
733
  self._start_web_server()
604
734
  self.__register_packages()
605
735
 
736
+ @property
737
+ def default_video_encoder(self) -> str | None:
738
+ if self._default_video_encoder is None:
739
+ self._default_video_encoder = self._determine_default_video_encoder()
740
+ return self._default_video_encoder
741
+
742
+ def _determine_default_video_encoder(self) -> str | None:
743
+ """
744
+ Returns the first available encoder from a list of candidates.
745
+
746
+ TODO:
747
+ - the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
748
+ - allow user override via a config option 'video_encoder'
749
+ """
750
+ # look for available encoders, in this order
751
+ candidates = [
752
+ 'libx264', # GPL, best quality
753
+ 'libopenh264', # BSD
754
+ ]
755
+
756
+ try:
757
+ # Get list of available encoders
758
+ result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
759
+
760
+ if result.returncode == 0:
761
+ available_encoders = result.stdout
762
+ for encoder in candidates:
763
+ # ffmpeg -encoders output format: " V..... encoder_name description"
764
+ if f' {encoder} ' in available_encoders:
765
+ _logger.debug(f'Using H.264 encoder: {encoder}')
766
+ return encoder
767
+ except Exception:
768
+ pass
769
+ return None
770
+
606
771
  def __register_packages(self) -> None:
607
772
  """Declare optional packages that are utilized by some parts of the code."""
773
+ self.__register_package('accelerate')
608
774
  self.__register_package('anthropic')
775
+ self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
609
776
  self.__register_package('boto3')
610
777
  self.__register_package('datasets')
778
+ self.__register_package('diffusers')
611
779
  self.__register_package('fiftyone')
780
+ self.__register_package('twelvelabs')
781
+ self.__register_package('fal_client', library_name='fal-client')
612
782
  self.__register_package('fireworks', library_name='fireworks-ai')
783
+ self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
613
784
  self.__register_package('google.genai', library_name='google-genai')
785
+ self.__register_package('groq')
614
786
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
615
787
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
788
+ self.__register_package('librosa')
616
789
  self.__register_package('llama_cpp', library_name='llama-cpp-python')
790
+ self.__register_package('mcp')
617
791
  self.__register_package('mistralai')
618
792
  self.__register_package('mistune')
619
793
  self.__register_package('ollama')
@@ -622,8 +796,10 @@ class Env:
622
796
  self.__register_package('pyarrow')
623
797
  self.__register_package('pydantic')
624
798
  self.__register_package('replicate')
799
+ self.__register_package('reve')
625
800
  self.__register_package('sentencepiece')
626
801
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
802
+ self.__register_package('soundfile')
627
803
  self.__register_package('spacy')
628
804
  self.__register_package('tiktoken')
629
805
  self.__register_package('together')
@@ -631,11 +807,14 @@ class Env:
631
807
  self.__register_package('torchaudio')
632
808
  self.__register_package('torchvision')
633
809
  self.__register_package('transformers')
810
+ self.__register_package('voyageai')
634
811
  self.__register_package('whisper', library_name='openai-whisper')
635
812
  self.__register_package('whisperx')
636
813
  self.__register_package('yolox', library_name='pixeltable-yolox')
814
+ self.__register_package('lancedb')
815
+ self.__register_package('scenedetect')
637
816
 
638
- def __register_package(self, package_name: str, library_name: Optional[str] = None) -> None:
817
+ def __register_package(self, package_name: str, library_name: str | None = None) -> None:
639
818
  is_installed: bool
640
819
  try:
641
820
  is_installed = importlib.util.find_spec(package_name) is not None
@@ -647,7 +826,11 @@ class Env:
647
826
  library_name=library_name or package_name, # defaults to package_name unless specified otherwise
648
827
  )
649
828
 
650
- def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
829
+ def require_binary(self, binary_name: str) -> None:
830
+ if not shutil.which(binary_name):
831
+ raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
832
+
833
+ def require_package(self, package_name: str, min_version: list[int] | None = None) -> None:
651
834
  """
652
835
  Checks whether the specified optional package is available. If not, raises an exception
653
836
  with an error message informing the user how to install it.
@@ -691,14 +874,8 @@ class Env:
691
874
  else:
692
875
  os.remove(path)
693
876
 
694
- def num_tmp_files(self) -> int:
695
- return len(glob.glob(f'{self._tmp_dir}/*'))
696
-
697
- def create_tmp_path(self, extension: str = '') -> Path:
698
- return self._tmp_dir / f'{uuid.uuid4()}{extension}'
699
-
700
- # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
701
- def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
877
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Type[T] | None) -> T:
878
+ def get_resource_pool_info(self, pool_id: str, make_pool_info: Callable[[], T] | None = None) -> T:
702
879
  """Returns the info object for the given id, creating it if necessary."""
703
880
  info = self._resource_pool_info.get(pool_id)
704
881
  if info is None and make_pool_info is not None:
@@ -711,6 +888,14 @@ class Env:
711
888
  assert self._media_dir is not None
712
889
  return self._media_dir
713
890
 
891
+ @property
892
+ def default_input_media_dest(self) -> str | None:
893
+ return self._default_input_media_dest
894
+
895
+ @property
896
+ def default_output_media_dest(self) -> str | None:
897
+ return self._default_output_media_dest
898
+
714
899
  @property
715
900
  def file_cache_dir(self) -> Path:
716
901
  assert self._file_cache_dir is not None
@@ -746,24 +931,74 @@ class Env:
746
931
  have no sub-dependencies (in fact, this is how spaCy normally manages its model resources).
747
932
  """
748
933
  import spacy
749
- from spacy.cli.download import get_model_filename
934
+ from spacy.cli.download import download
750
935
 
751
936
  spacy_model = 'en_core_web_sm'
752
- spacy_model_version = '3.7.1'
753
- filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
754
- url = f'{spacy.about.__download_url__}/{filename}'
755
- # Try to `pip install` the model. We set check=False; if the pip command fails, it's not necessarily
756
- # a problem, because the model might have been installed on a previous attempt.
757
- self._logger.info(f'Ensuring spaCy model is installed: {filename}')
758
- ret = subprocess.run([sys.executable, '-m', 'pip', 'install', '-qU', url], check=False)
759
- if ret.returncode != 0:
760
- self._logger.warning(f'pip install failed for spaCy model: {filename}')
937
+ self._logger.info(f'Ensuring spaCy model is installed: {spacy_model}')
938
+ download(spacy_model)
761
939
  self._logger.info(f'Loading spaCy model: {spacy_model}')
762
940
  try:
763
941
  self._spacy_nlp = spacy.load(spacy_model)
764
942
  except Exception as exc:
765
943
  raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
766
944
 
945
+ def _clean_up(self) -> None:
946
+ """
947
+ Internal cleanup method that properly closes all resources and resets state.
948
+ This is called before destroying the singleton instance.
949
+ """
950
+ assert self._current_session is None
951
+ assert self._current_conn is None
952
+
953
+ # Stop HTTP server
954
+ if self._httpd is not None:
955
+ try:
956
+ self._httpd.shutdown()
957
+ self._httpd.server_close()
958
+ except Exception as e:
959
+ _logger.warning(f'Error stopping HTTP server: {e}')
960
+
961
+ # First terminate all connections to the database
962
+ if self._db_server is not None:
963
+ assert self._dbms is not None
964
+ assert self._db_name is not None
965
+ try:
966
+ temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
967
+ try:
968
+ with temp_engine.begin() as conn:
969
+ conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
970
+ _logger.info(f"Terminated all connections to database '{self._db_name}'")
971
+ except Exception as e:
972
+ _logger.warning(f'Error terminating database connections: {e}')
973
+ finally:
974
+ temp_engine.dispose()
975
+ except Exception as e:
976
+ _logger.warning(f'Error stopping database server: {e}')
977
+
978
+ # Dispose of SQLAlchemy engine (after stopping db server)
979
+ if self._sa_engine is not None:
980
+ try:
981
+ self._sa_engine.dispose()
982
+ except Exception as e:
983
+ _logger.warning(f'Error disposing engine: {e}')
984
+
985
+ # Close event loop
986
+ if self._event_loop is not None:
987
+ try:
988
+ if self._event_loop.is_running():
989
+ self._event_loop.stop()
990
+ self._event_loop.close()
991
+ except Exception as e:
992
+ _logger.warning(f'Error closing event loop: {e}')
993
+
994
+ # Remove logging handlers
995
+ for handler in self._logger.handlers[:]:
996
+ try:
997
+ handler.close()
998
+ self._logger.removeHandler(handler)
999
+ except Exception as e:
1000
+ _logger.warning(f'Error removing handler: {e}')
1001
+
767
1002
 
768
1003
  def register_client(name: str) -> Callable:
769
1004
  """Decorator that registers a third-party API client for use by Pixeltable.
@@ -792,30 +1027,35 @@ def register_client(name: str) -> Callable:
792
1027
 
793
1028
  def decorator(fn: Callable) -> None:
794
1029
  sig = inspect.signature(fn)
795
- param_names = list(sig.parameters.keys())
796
- _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
1030
+ params = dict(sig.parameters)
1031
+ with _registered_clients_lock:
1032
+ _registered_clients[name] = ApiClient(init_fn=fn, params=params)
797
1033
 
798
1034
  return decorator
799
1035
 
800
1036
 
1037
+ _registered_clients_lock: threading.Lock = threading.Lock()
801
1038
  _registered_clients: dict[str, ApiClient] = {}
802
1039
 
803
1040
 
804
1041
  @dataclass
805
1042
  class ApiClient:
806
1043
  init_fn: Callable
807
- param_names: list[str]
808
- client_obj: Optional[Any] = None
1044
+ params: dict[str, inspect.Parameter]
1045
+ client_obj: Any | None = None
809
1046
 
810
1047
 
811
1048
  @dataclass
812
1049
  class PackageInfo:
813
1050
  is_installed: bool
814
1051
  library_name: str # pypi library name (may be different from package name)
815
- version: Optional[list[int]] = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
1052
+ version: list[int] | None = None # installed version, as a list of components (such as [3,0,2] for "3.0.2")
816
1053
 
817
1054
 
818
1055
  TIME_FORMAT = '%H:%M.%S %f'
1056
+ # As far as rate limiting goes, we try not to go lower than 5% of the capacity because we don't have perfect information
1057
+ # about the rate limits and the usage
1058
+ TARGET_RATE_LIMIT_RESOURCE_FRACT = 0.05
819
1059
 
820
1060
 
821
1061
  @dataclass
@@ -838,6 +1078,10 @@ class RateLimitsInfo:
838
1078
  get_request_resources: Callable[..., dict[str, int]]
839
1079
 
840
1080
  resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
1081
+ has_exc: bool = False
1082
+
1083
+ def debug_str(self) -> str:
1084
+ return ','.join(info.debug_str() for info in self.resource_limits.values())
841
1085
 
842
1086
  def is_initialized(self) -> bool:
843
1087
  return len(self.resource_limits) > 0
@@ -845,25 +1089,46 @@ class RateLimitsInfo:
845
1089
  def reset(self) -> None:
846
1090
  self.resource_limits.clear()
847
1091
 
848
- def record(self, **kwargs: Any) -> None:
849
- now = datetime.datetime.now(tz=datetime.timezone.utc)
1092
+ def record(self, request_ts: datetime.datetime, reset_exc: bool = False, **kwargs: Any) -> None:
1093
+ """Update self.resource_limits with the provided rate limit info.
1094
+ Args:
1095
+ - request_ts: time at which the request was made
1096
+ - reset_exc: if True, reset the has_exc flag
1097
+ """
850
1098
  if len(self.resource_limits) == 0:
851
- self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
1099
+ self.resource_limits = {k: RateLimitInfo(k, request_ts, *v) for k, v in kwargs.items() if v is not None}
852
1100
  # TODO: remove
853
1101
  for info in self.resource_limits.values():
854
- _logger.debug(
855
- f'Init {info.resource} rate limit: rem={info.remaining} '
856
- f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
857
- )
1102
+ _logger.debug(f'Updated resource state: {info}')
858
1103
  else:
1104
+ if self.has_exc and not reset_exc:
1105
+ # ignore updates until we're asked to reset
1106
+ _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
1107
+ return
1108
+ self.has_exc = False
859
1109
  for k, v in kwargs.items():
860
1110
  if v is not None:
861
- self.resource_limits[k].update(now, *v)
1111
+ self.resource_limits[k].update(request_ts, *v)
1112
+ _logger.debug(f'Updated resource state: {self.resource_limits[k]}')
862
1113
 
863
- @abstractmethod
864
- def get_retry_delay(self, exc: Exception) -> Optional[float]:
1114
+ def record_exc(self, request_ts: datetime.datetime, exc: Exception) -> None:
1115
+ """Update self.resource_limits based on the exception headers
1116
+ Args:
1117
+ - request_ts: time at which the request that caused the exception was made
1118
+ - exc: the exception raised"""
1119
+ self.has_exc = True
1120
+
1121
+ def get_retry_delay(self, exc: Exception, attempt: int) -> float | None:
865
1122
  """Returns number of seconds to wait before retry, or None if not retryable"""
866
- pass
1123
+ # Find the highest wait until at least 5% availability of all resources
1124
+ max_wait = 0.0
1125
+ for limit_info in self.resource_limits.values():
1126
+ time_until = limit_info.estimated_resource_refill_delay(
1127
+ math.ceil(TARGET_RATE_LIMIT_RESOURCE_FRACT * limit_info.limit)
1128
+ )
1129
+ if time_until is not None:
1130
+ max_wait = max(max_wait, time_until)
1131
+ return max_wait if max_wait > 0 else None
867
1132
 
868
1133
 
869
1134
  @dataclass
@@ -871,22 +1136,71 @@ class RateLimitInfo:
871
1136
  """Container for rate limit-related information for a single resource."""
872
1137
 
873
1138
  resource: str
874
- recorded_at: datetime.datetime
1139
+ request_start_ts: datetime.datetime
875
1140
  limit: int
876
1141
  remaining: int
877
1142
  reset_at: datetime.datetime
878
1143
 
879
- def update(self, recorded_at: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime) -> None:
880
- # we always update everything, even though responses may come back out-of-order: we can't use reset_at to
881
- # determine order, because it doesn't increase monotonically (the reeset duration shortens as output_tokens
882
- # are freed up - going from max to actual)
883
- self.recorded_at = recorded_at
1144
+ def debug_str(self) -> str:
1145
+ return (
1146
+ f'{self.resource}@{self.request_start_ts.strftime(TIME_FORMAT)}: '
1147
+ f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
1148
+ )
1149
+
1150
+ def update(
1151
+ self, request_start_ts: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime
1152
+ ) -> None:
1153
+ # Responses can come out of order, especially for failed requests. We need to be careful not to overwrite
1154
+ # the current state with less up-to-date information. We use request_start_ts as a proxy for rate limit info
1155
+ # recency.
1156
+ if self.request_start_ts > request_start_ts:
1157
+ # The current state is more up-to-date than the update
1158
+ _logger.debug(
1159
+ f'Ignoring out-of-date update for {self.resource}. Current request_start_ts: '
1160
+ f'{self.request_start_ts}, update: {request_start_ts}'
1161
+ )
1162
+ return
1163
+ self.request_start_ts = request_start_ts
884
1164
  self.limit = limit
885
1165
  self.remaining = remaining
886
- reset_delta = reset_at - self.reset_at
887
1166
  self.reset_at = reset_at
888
- # TODO: remove
889
- _logger.debug(
890
- f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
891
- f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
1167
+
1168
+ def estimated_resource_refill_delay(self, target_remaining: int) -> float | None:
1169
+ """Estimate time in seconds until remaining resources reaches target_remaining.
1170
+ Assumes linear replenishment of resources over time.
1171
+ Returns None if unable to estimate.
1172
+ """
1173
+ if self.remaining >= target_remaining:
1174
+ return 0
1175
+ if self.request_start_ts >= self.reset_at:
1176
+ return 0
1177
+ if self.limit < target_remaining:
1178
+ return None
1179
+
1180
+ # Estimate resource refill rate based on the recorded state and timestamps. Assumes linear refill.
1181
+ refill_rate = (self.limit - self.remaining) / (self.reset_at - self.request_start_ts).total_seconds()
1182
+ assert refill_rate > 0, f'self={self}, target_remaining={target_remaining}'
1183
+
1184
+ now = datetime.datetime.now(tz=datetime.timezone.utc)
1185
+ time_until = (target_remaining - self.remaining) / refill_rate - (now - self.request_start_ts).total_seconds()
1186
+ return max(0, math.ceil(time_until))
1187
+
1188
+ def __repr__(self) -> str:
1189
+ return (
1190
+ f'RateLimitInfo(resource={self.resource}, request_start_ts={self.request_start_ts}, '
1191
+ f'remaining={self.remaining}/{self.limit} ({(100 * self.remaining / self.limit):.1f}%), '
1192
+ f'reset_at={self.reset_at})'
892
1193
  )
1194
+
1195
+
1196
+ @dataclass
1197
+ class RuntimeCtx:
1198
+ """
1199
+ Container for runtime data provided by the execution system to udfs.
1200
+
1201
+ Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
1202
+ """
1203
+
1204
+ # Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
1205
+ # If True, call RateLimitsInfo.record() with reset_exc=True.
1206
+ is_retry: bool = False