pixeltable 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

pixeltable/__init__.py CHANGED
@@ -30,7 +30,7 @@ from .globals import (
30
30
  from .type_system import Array, Audio, Bool, Date, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
31
31
 
32
32
  # This import must go last to avoid circular imports.
33
- from . import ext, functions, io, iterators # isort: skip
33
+ from . import functions, io, iterators # isort: skip
34
34
 
35
35
  # This is the safest / most maintainable way to construct __all__: start with the default and "blacklist"
36
36
  # stuff that we don't want in there. (Using a "whitelist" is considerably harder to maintain.)
@@ -2,13 +2,17 @@ from __future__ import annotations
2
2
 
3
3
  import enum
4
4
  import logging
5
- from typing import TYPE_CHECKING, Any, Literal, Optional, overload
5
+ from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, cast, overload
6
6
  from uuid import UUID
7
7
 
8
+ import pydantic
9
+ import pydantic_core
10
+
8
11
  import pixeltable as pxt
9
12
  from pixeltable import exceptions as excs, type_system as ts
10
13
  from pixeltable.env import Env
11
14
  from pixeltable.utils.filecache import FileCache
15
+ from pixeltable.utils.pydantic import is_json_convertible
12
16
 
13
17
  from .globals import MediaValidation
14
18
  from .table import Table
@@ -137,8 +141,24 @@ class InsertableTable(Table):
137
141
  from pixeltable.catalog import Catalog
138
142
  from pixeltable.io.table_data_conduit import UnkTableDataConduit
139
143
 
144
+ if source is not None and isinstance(source, Sequence) and len(source) == 0:
145
+ raise excs.Error('Cannot insert an empty sequence')
146
+ fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
147
+
140
148
  with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
141
149
  table = self
150
+
151
+ # TODO: unify with TableDataConduit
152
+ if source is not None and isinstance(source, Sequence) and isinstance(source[0], pydantic.BaseModel):
153
+ status = self._insert_pydantic(
154
+ cast(Sequence[pydantic.BaseModel], source), # needed for mypy
155
+ print_stats=print_stats,
156
+ fail_on_exception=fail_on_exception,
157
+ )
158
+ Env.get().console_logger.info(status.insert_msg)
159
+ FileCache.get().emit_eviction_warnings()
160
+ return status
161
+
142
162
  if source is None:
143
163
  source = [kwargs]
144
164
  kwargs = None
@@ -154,7 +174,6 @@ class InsertableTable(Table):
154
174
  data_source.add_table_info(table)
155
175
  data_source.prepare_for_insert_into_table()
156
176
 
157
- fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
158
177
  return table.insert_table_data_source(
159
178
  data_source=data_source, fail_on_exception=fail_on_exception, print_stats=print_stats
160
179
  )
@@ -184,32 +203,110 @@ class InsertableTable(Table):
184
203
  FileCache.get().emit_eviction_warnings()
185
204
  return status
186
205
 
187
- def _validate_input_rows(self, rows: list[dict[str, Any]]) -> None:
188
- """Verify that the input rows match the table schema"""
189
- valid_col_names = set(self._get_schema().keys())
190
- reqd_col_names = set(self._tbl_version_path.tbl_version.get().get_required_col_names())
191
- computed_col_names = set(self._tbl_version_path.tbl_version.get().get_computed_col_names())
192
- for row in rows:
193
- assert isinstance(row, dict)
194
- col_names = set(row.keys())
195
- if len(reqd_col_names - col_names) > 0:
196
- raise excs.Error(f'Missing required column(s) ({", ".join(reqd_col_names - col_names)}) in row {row}')
197
-
198
- for col_name, val in row.items():
199
- if col_name not in valid_col_names:
200
- raise excs.Error(f'Unknown column name {col_name!r} in row {row}')
201
- if col_name in computed_col_names:
202
- raise excs.Error(f'Value for computed column {col_name!r} in row {row}')
203
-
204
- # validate data
205
- col = self._tbl_version_path.get_column(col_name)
206
- try:
207
- # basic sanity checks here
208
- checked_val = col.col_type.create_literal(val)
209
- row[col_name] = checked_val
210
- except TypeError as e:
211
- msg = str(e)
212
- raise excs.Error(f'Error in column {col.name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
206
def _insert_pydantic(
    self, rows: Sequence[pydantic.BaseModel], print_stats: bool = False, fail_on_exception: bool = True
) -> UpdateStatus:
    """
    Insert a sequence of Pydantic model instances as rows of this table.

    The class of the first row is validated against the table schema, and every row must be an instance of
    exactly that class. Rows are converted with `model_dump(mode='json')` before being inserted.

    Args:
        rows: Model instances to insert. Must be non-empty (`rows[0]` determines the model class).
        print_stats: Passed through to the underlying insert.
        fail_on_exception: Passed through to the underlying insert.

    Returns:
        The status returned by the underlying insert.

    Raises:
        excs.Error: If a row is not an instance of the first row's model class, cannot be serialized to JSON,
            or has no non-None value for a column that is required for insert.
    """
    model_class = type(rows[0])
    self._validate_pydantic_model(model_class)

    # convert rows one-by-one in order to be able to print meaningful error messages
    pxt_rows: list[dict[str, Any]] = []
    for i, row in enumerate(rows):
        # Check the row type *before* serializing: only rows[0] is known to be a pydantic model, and calling
        # model_dump() on an arbitrary object would surface as an AttributeError rather than an excs.Error.
        if type(row) is not model_class:
            raise excs.Error(
                f'Expected {model_class.__name__!r} instance, got {type(row).__name__!r} (in row {i})'
            )
        try:
            pxt_rows.append(row.model_dump(mode='json'))
        except pydantic_core.PydanticSerializationError as e:
            raise excs.Error(f'Row {i}: error serializing pydantic model to JSON:\n{e!s}') from e

    # explicitly check that all required columns are present and non-None in the rows,
    # because we ignore nullability when validating the pydantic model
    reqd_col_names = [col.name for col in self._tbl_version_path.columns() if col.is_required_for_insert]
    for i, pxt_row in enumerate(pxt_rows):
        for col_name in reqd_col_names:
            if pxt_row.get(col_name) is None:
                raise excs.Error(f'Missing required column {col_name!r} in row {i}')

    return self._tbl_version.get().insert(
        rows=pxt_rows, df=None, print_stats=print_stats, fail_on_exception=fail_on_exception
    )
235
+
236
def _validate_pydantic_model(self, model: type[pydantic.BaseModel]) -> None:
    """
    Check if a Pydantic model is compatible with this table for insert operations.

    A model is compatible if:
    - All required table columns have corresponding model fields with compatible types
    - Model does not define fields for computed columns
    - Model field types are compatible with table column types

    Nullability is ignored when comparing types: optional model fields are accepted for required table
    columns, as long as the model instances provide a non-null value.

    Args:
        model: The Pydantic model class (not an instance) to check.

    Raises:
        excs.Error: If the model is not compatible with this table's schema.
    """
    assert isinstance(model, type) and issubclass(model, pydantic.BaseModel)

    schema = self._get_schema()
    required_cols = set(self._tbl_version.get().get_required_col_names())
    computed_cols = set(self._tbl_version.get().get_computed_col_names())
    model_fields = model.model_fields
    model_field_names = set(model_fields.keys())

    # column names are sorted wherever they are reported, so that error messages are deterministic
    missing_required = required_cols - model_field_names
    if missing_required:
        raise excs.Error(
            f'Pydantic model {model.__name__!r} is missing required columns: '
            f'{", ".join(f"{col_name!r}" for col_name in sorted(missing_required))}'
        )

    computed_in_model = computed_cols & model_field_names
    if computed_in_model:
        raise excs.Error(
            f'Pydantic model {model.__name__!r} has fields for computed columns: '
            f'{", ".join(f"{col_name!r}" for col_name in sorted(computed_in_model))}'
        )

    # validate type compatibility
    common_fields = model_field_names & set(schema.keys())
    if len(common_fields) == 0:
        raise excs.Error(
            f'Pydantic model {model.__name__!r} has no fields that map to columns in table {self._name!r}'
        )
    # iterate in sorted order so that the first reported problem does not depend on set ordering
    for field_name in sorted(common_fields):
        pxt_col_type = schema[field_name]
        model_field = model_fields[field_name]
        model_type = model_field.annotation
        # Annotations are not always classes (eg, `Optional[int]` or `int | None`) and then may lack a
        # __name__; fall back to the annotation's string form so that error reporting cannot itself fail.
        model_type_name = model_type.__name__ if isinstance(model_type, type) else str(model_type)

        # we ignore nullability: we want to accept optional model fields for required table columns, as long as
        # the model instances provide a non-null value
        # model_dump(mode='json') converts enums to their values
        # NOTE(review): the previous comment here referred to `allow_enum=True`, but the call passes
        # `infer_pydantic_json=True` -- confirm that this flag covers the enum case
        inferred_pxt_type = ts.ColumnType.from_python_type(model_type, infer_pydantic_json=True)
        if inferred_pxt_type is None:
            raise excs.Error(
                f'Pydantic model {model.__name__!r}: cannot infer Pixeltable type for column {field_name!r}'
            )

        if pxt_col_type.is_media_type():
            # media types require file paths, either as str or Path
            if not inferred_pxt_type.is_string_type():
                raise excs.Error(
                    f"Column {field_name!r} requires a 'str' or 'Path' field in {model.__name__!r}, but it is "
                    f'{model_type_name!r}'
                )
        else:
            if not pxt_col_type.is_supertype_of(inferred_pxt_type, ignore_nullable=True):
                raise excs.Error(
                    f'Pydantic model {model.__name__!r} has incompatible type ({model_type_name}) '
                    f'for column {field_name!r} ({pxt_col_type})'
                )

        # nested models are only acceptable if they can be converted to JSON
        if (
            isinstance(model_type, type)
            and issubclass(model_type, pydantic.BaseModel)
            and not is_json_convertible(model_type)
        ):
            raise excs.Error(
                f'Pydantic model {model.__name__!r} has field {field_name!r} with nested model '
                f'{model_type_name!r}, which is not JSON-convertible'
            )
213
310
 
214
311
  def delete(self, where: Optional['exprs.Expr'] = None) -> UpdateStatus:
215
312
  """Delete rows in this table.
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import abc
4
4
  import builtins
5
+ import datetime
5
6
  import json
6
7
  import logging
7
8
  from keyword import iskeyword as is_python_keyword
@@ -9,7 +10,6 @@ from pathlib import Path
9
10
  from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, TypedDict, overload
10
11
 
11
12
  from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
12
- import datetime
13
13
  from uuid import UUID
14
14
 
15
15
  import pandas as pd
@@ -1356,6 +1356,15 @@ class Table(SchemaObject):
1356
1356
  Insert rows from a CSV file:
1357
1357
 
1358
1358
  >>> tbl.insert(source='path/to/file.csv')
1359
+
1360
+ Insert Pydantic model instances into a table with two `pxt.Int` columns `a` and `b`:
1361
+
1362
+ >>> class MyModel(pydantic.BaseModel):
1363
+ ... a: int
1364
+ ... b: int
1365
+ ...
1366
+ ... models = [MyModel(a=1, b=2), MyModel(a=3, b=4)]
1367
+ ... tbl.insert(models)
1359
1368
  """
1360
1369
  raise NotImplementedError
1361
1370
 
pixeltable/config.py CHANGED
@@ -167,6 +167,7 @@ KNOWN_CONFIG_OPTIONS = {
167
167
  'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
168
168
  'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
169
169
  'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
170
+ 'hf': {'auth_token': 'Hugging Face access token'},
170
171
  'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
171
172
  'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
172
173
  'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
pixeltable/env.py CHANGED
@@ -11,6 +11,7 @@ import logging
11
11
  import os
12
12
  import platform
13
13
  import shutil
14
+ import subprocess
14
15
  import sys
15
16
  import threading
16
17
  import types
@@ -27,6 +28,7 @@ import nest_asyncio # type: ignore[import-untyped]
27
28
  import pixeltable_pgserver
28
29
  import sqlalchemy as sql
29
30
  from pillow_heif import register_heif_opener # type: ignore[import-untyped]
31
+ from tenacity import retry, stop_after_attempt, wait_exponential_jitter
30
32
  from tqdm import TqdmWarning
31
33
 
32
34
  from pixeltable import exceptions as excs
@@ -81,6 +83,7 @@ class Env:
81
83
  _file_cache_size_g: float
82
84
  _pxt_api_key: Optional[str]
83
85
  _stdout_handler: logging.StreamHandler
86
+ _default_video_encoder: str | None
84
87
  _initialized: bool
85
88
 
86
89
  _resource_pool_info: dict[str, Any]
@@ -104,10 +107,14 @@ class Env:
104
107
  cls._instance._clean_up()
105
108
  cls._instance = None
106
109
  env = Env()
107
- env._set_up(reinit_db=reinit_db)
108
- env._upgrade_metadata()
109
- cls._instance = env
110
- cls.__initializing = False
110
+ try:
111
+ env._set_up(reinit_db=reinit_db)
112
+ env._upgrade_metadata()
113
+ cls._instance = env
114
+ finally:
115
+ # Reset the initializing flag, even if setup fails.
116
+ # This prevents the environment from being left in a broken state.
117
+ cls.__initializing = False
111
118
 
112
119
  def __init__(self) -> None:
113
120
  assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
@@ -127,6 +134,7 @@ class Env:
127
134
  self._spacy_nlp = None
128
135
  self._httpd = None
129
136
  self._http_address = None
137
+ self._default_video_encoder = None
130
138
 
131
139
  # logging-related state
132
140
  self._logger = logging.getLogger('pixeltable')
@@ -500,14 +508,24 @@ class Env:
500
508
  assert self._db_url is not None
501
509
  assert self._db_name is not None
502
510
 
511
+ @retry(
512
+ stop=stop_after_attempt(3), # Stop after 3 attempts
513
+ wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
514
+ )
503
515
  def _init_metadata(self) -> None:
504
516
  """
505
517
  Create pixeltable metadata tables and system metadata.
506
518
  This is an idempotent operation.
519
+
520
+ Retry logic handles race conditions when multiple Pixeltable processes
521
+ attempt to initialize metadata tables simultaneously. The first process may succeed
522
+ in creating tables while others encounter database constraints (e.g., "table already exists").
523
+ Exponential backoff with jitter reduces contention between competing processes.
507
524
  """
508
525
  assert self._sa_engine is not None
509
526
  from pixeltable import metadata
510
527
 
528
+ self._logger.debug('Creating pixeltable metadata')
511
529
  metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
512
530
  metadata.create_system_info(self._sa_engine)
513
531
 
@@ -662,6 +680,41 @@ class Env:
662
680
  self._start_web_server()
663
681
  self.__register_packages()
664
682
 
683
@property
def default_video_encoder(self) -> str | None:
    """
    Name of the default video encoder, or None if `_determine_default_video_encoder()` returns None.

    A non-None result is stored in `_default_video_encoder` and reused on later accesses. A None result is
    indistinguishable from "not yet determined", so detection runs again on the next access.
    """
    encoder = self._default_video_encoder
    if encoder is None:
        encoder = self._determine_default_video_encoder()
        self._default_video_encoder = encoder
    return encoder
688
+
689
def _determine_default_video_encoder(self) -> str | None:
    """
    Returns the first available encoder from a list of candidates.

    Runs `ffmpeg -encoders` and searches its output for each candidate name, in order of preference.
    Detection is best-effort: if ffmpeg cannot be started, exits with an error, or times out, the failure is
    logged at debug level and no encoder is reported.

    Returns:
        The name of the first candidate found in the ffmpeg output, or None if no candidate was found or the
        ffmpeg invocation failed.

    TODO:
    - the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
    - allow user override via a config option 'video_encoder'
    """
    # look for available encoders, in this order
    candidates = [
        'libx264',  # GPL, best quality
        'libopenh264',  # BSD
    ]

    try:
        # check=True raises CalledProcessError on a non-zero exit status, so no separate returncode test is needed
        result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        # ffmpeg missing or not executable, non-zero exit status, timeout, or undecodable output
        _logger.debug(f'Could not determine available video encoders: {exc}')
        return None

    available_encoders = result.stdout
    for encoder in candidates:
        # ffmpeg -encoders output format: " V..... encoder_name description"
        if f' {encoder} ' in available_encoders:
            _logger.debug(f'Using H.264 encoder: {encoder}')
            return encoder
    return None
717
+
665
718
  def __register_packages(self) -> None:
666
719
  """Declare optional packages that are utilized by some parts of the code."""
667
720
  self.__register_package('anthropic')
@@ -26,6 +26,8 @@ from . import (
26
26
  video,
27
27
  vision,
28
28
  whisper,
29
+ whisperx,
30
+ yolox,
29
31
  )
30
32
  from .globals import count, map, max, mean, min, sum
31
33
 
@@ -3,6 +3,7 @@ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
3
3
  """
4
4
 
5
5
  import pixeltable as pxt
6
+ import pixeltable.utils.av as av_utils
6
7
  from pixeltable.utils.code import local_public_names
7
8
 
8
9
 
@@ -47,7 +48,7 @@ def get_metadata(audio: pxt.Audio) -> dict:
47
48
 
48
49
  >>> tbl.select(tbl.audio_col.get_metadata()).collect()
49
50
  """
50
- return pxt.functions.video._get_metadata(audio)
51
+ return av_utils.get_metadata(audio)
51
52
 
52
53
 
53
54
  __all__ = local_public_names(__name__)
@@ -14,6 +14,7 @@ import PIL.Image
14
14
 
15
15
  import pixeltable as pxt
16
16
  from pixeltable import env, exceptions as excs, exprs
17
+ from pixeltable.utils.code import local_public_names
17
18
  from pixeltable.utils.media_store import TempStore
18
19
 
19
20
  if TYPE_CHECKING:
@@ -232,3 +233,10 @@ async def generate_videos(
232
233
  @generate_videos.resource_pool
233
234
  def _(model: str) -> str:
234
235
  return f'request-rate:veo:{model}'
236
+
237
+
238
+ __all__ = local_public_names(__name__)
239
+
240
+
241
def __dir__() -> list[str]:
    """Module-level `__dir__` hook: report `__all__` as this module's attribute listing."""
    public_names = __all__
    return public_names