pixeltable 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/catalog/insertable_table.py +125 -28
- pixeltable/catalog/table.py +10 -1
- pixeltable/config.py +1 -0
- pixeltable/env.py +57 -4
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/audio.py +2 -1
- pixeltable/functions/gemini.py +8 -0
- pixeltable/functions/video.py +534 -81
- pixeltable/functions/whisper.py +8 -0
- pixeltable/functions/whisperx.py +177 -0
- pixeltable/{ext/functions → functions}/yolox.py +0 -4
- pixeltable/globals.py +3 -1
- pixeltable/iterators/video.py +138 -0
- pixeltable/metadata/__init__.py +3 -1
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/type_system.py +20 -4
- pixeltable/utils/av.py +111 -0
- pixeltable/utils/code.py +2 -1
- pixeltable/utils/pydantic.py +60 -0
- {pixeltable-0.4.8.dist-info → pixeltable-0.4.10.dist-info}/METADATA +1 -1
- {pixeltable-0.4.8.dist-info → pixeltable-0.4.10.dist-info}/RECORD +26 -24
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- {pixeltable-0.4.8.dist-info → pixeltable-0.4.10.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.8.dist-info → pixeltable-0.4.10.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.8.dist-info → pixeltable-0.4.10.dist-info}/licenses/LICENSE +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -30,7 +30,7 @@ from .globals import (
|
|
|
30
30
|
from .type_system import Array, Audio, Bool, Date, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
|
|
31
31
|
|
|
32
32
|
# This import must go last to avoid circular imports.
|
|
33
|
-
from . import
|
|
33
|
+
from . import functions, io, iterators # isort: skip
|
|
34
34
|
|
|
35
35
|
# This is the safest / most maintainable way to construct __all__: start with the default and "blacklist"
|
|
36
36
|
# stuff that we don't want in there. (Using a "whitelist" is considerably harder to maintain.)
|
|
@@ -2,13 +2,17 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
4
|
import logging
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Literal, Optional, overload
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, cast, overload
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
|
+
import pydantic
|
|
9
|
+
import pydantic_core
|
|
10
|
+
|
|
8
11
|
import pixeltable as pxt
|
|
9
12
|
from pixeltable import exceptions as excs, type_system as ts
|
|
10
13
|
from pixeltable.env import Env
|
|
11
14
|
from pixeltable.utils.filecache import FileCache
|
|
15
|
+
from pixeltable.utils.pydantic import is_json_convertible
|
|
12
16
|
|
|
13
17
|
from .globals import MediaValidation
|
|
14
18
|
from .table import Table
|
|
@@ -137,8 +141,24 @@ class InsertableTable(Table):
|
|
|
137
141
|
from pixeltable.catalog import Catalog
|
|
138
142
|
from pixeltable.io.table_data_conduit import UnkTableDataConduit
|
|
139
143
|
|
|
144
|
+
if source is not None and isinstance(source, Sequence) and len(source) == 0:
|
|
145
|
+
raise excs.Error('Cannot insert an empty sequence')
|
|
146
|
+
fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
|
|
147
|
+
|
|
140
148
|
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
141
149
|
table = self
|
|
150
|
+
|
|
151
|
+
# TODO: unify with TableDataConduit
|
|
152
|
+
if source is not None and isinstance(source, Sequence) and isinstance(source[0], pydantic.BaseModel):
|
|
153
|
+
status = self._insert_pydantic(
|
|
154
|
+
cast(Sequence[pydantic.BaseModel], source), # needed for mypy
|
|
155
|
+
print_stats=print_stats,
|
|
156
|
+
fail_on_exception=fail_on_exception,
|
|
157
|
+
)
|
|
158
|
+
Env.get().console_logger.info(status.insert_msg)
|
|
159
|
+
FileCache.get().emit_eviction_warnings()
|
|
160
|
+
return status
|
|
161
|
+
|
|
142
162
|
if source is None:
|
|
143
163
|
source = [kwargs]
|
|
144
164
|
kwargs = None
|
|
@@ -154,7 +174,6 @@ class InsertableTable(Table):
|
|
|
154
174
|
data_source.add_table_info(table)
|
|
155
175
|
data_source.prepare_for_insert_into_table()
|
|
156
176
|
|
|
157
|
-
fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
|
|
158
177
|
return table.insert_table_data_source(
|
|
159
178
|
data_source=data_source, fail_on_exception=fail_on_exception, print_stats=print_stats
|
|
160
179
|
)
|
|
@@ -184,32 +203,110 @@ class InsertableTable(Table):
|
|
|
184
203
|
FileCache.get().emit_eviction_warnings()
|
|
185
204
|
return status
|
|
186
205
|
|
|
187
|
-
def
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
206
|
+
def _insert_pydantic(
|
|
207
|
+
self, rows: Sequence[pydantic.BaseModel], print_stats: bool = False, fail_on_exception: bool = True
|
|
208
|
+
) -> UpdateStatus:
|
|
209
|
+
model_class = type(rows[0])
|
|
210
|
+
self._validate_pydantic_model(model_class)
|
|
211
|
+
# convert rows one-by-one in order to be able to print meaningful error messages
|
|
212
|
+
pxt_rows: list[dict[str, Any]] = []
|
|
213
|
+
for i, row in enumerate(rows):
|
|
214
|
+
try:
|
|
215
|
+
pxt_rows.append(row.model_dump(mode='json'))
|
|
216
|
+
except pydantic_core.PydanticSerializationError as e:
|
|
217
|
+
raise excs.Error(f'Row {i}: error serializing pydantic model to JSON:\n{e!s}') from e
|
|
218
|
+
|
|
219
|
+
# explicitly check that all required columns are present and non-None in the rows,
|
|
220
|
+
# because we ignore nullability when validating the pydantic model
|
|
221
|
+
reqd_col_names = [col.name for col in self._tbl_version_path.columns() if col.is_required_for_insert]
|
|
222
|
+
for i, pxt_row in enumerate(pxt_rows):
|
|
223
|
+
if type(rows[i]) is not model_class:
|
|
224
|
+
raise excs.Error(
|
|
225
|
+
f'Expected {model_class.__name__!r} instance, got {type(rows[i]).__name__!r} (in row {i})'
|
|
226
|
+
)
|
|
227
|
+
for col_name in reqd_col_names:
|
|
228
|
+
if pxt_row.get(col_name) is None:
|
|
229
|
+
raise excs.Error(f'Missing required column {col_name!r} in row {i}')
|
|
230
|
+
|
|
231
|
+
status = self._tbl_version.get().insert(
|
|
232
|
+
rows=pxt_rows, df=None, print_stats=print_stats, fail_on_exception=fail_on_exception
|
|
233
|
+
)
|
|
234
|
+
return status
|
|
235
|
+
|
|
236
|
+
def _validate_pydantic_model(self, model: type[pydantic.BaseModel]) -> None:
|
|
237
|
+
"""
|
|
238
|
+
Check if a Pydantic model is compatible with this table for insert operations.
|
|
239
|
+
|
|
240
|
+
A model is compatible if:
|
|
241
|
+
- All required table columns have corresponding model fields with compatible types
|
|
242
|
+
- Model does not define fields for computed columns
|
|
243
|
+
- Model field types are compatible with table column types
|
|
244
|
+
"""
|
|
245
|
+
assert isinstance(model, type) and issubclass(model, pydantic.BaseModel)
|
|
246
|
+
|
|
247
|
+
schema = self._get_schema()
|
|
248
|
+
required_cols = set(self._tbl_version.get().get_required_col_names())
|
|
249
|
+
computed_cols = set(self._tbl_version.get().get_computed_col_names())
|
|
250
|
+
model_fields = model.model_fields
|
|
251
|
+
model_field_names = set(model_fields.keys())
|
|
252
|
+
|
|
253
|
+
missing_required = required_cols - model_field_names
|
|
254
|
+
if missing_required:
|
|
255
|
+
raise excs.Error(
|
|
256
|
+
f'Pydantic model {model.__name__!r} is missing required columns: '
|
|
257
|
+
f'{", ".join(f"{col_name!r}" for col_name in missing_required)}'
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
computed_in_model = computed_cols & model_field_names
|
|
261
|
+
if computed_in_model:
|
|
262
|
+
raise excs.Error(
|
|
263
|
+
f'Pydantic model {model.__name__!r} has fields for computed columns: '
|
|
264
|
+
f'{", ".join(f"{col_name!r}" for col_name in computed_in_model)}'
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
# validate type compatibility
|
|
268
|
+
common_fields = model_field_names & set(schema.keys())
|
|
269
|
+
if len(common_fields) == 0:
|
|
270
|
+
raise excs.Error(
|
|
271
|
+
f'Pydantic model {model.__name__!r} has no fields that map to columns in table {self._name!r}'
|
|
272
|
+
)
|
|
273
|
+
for field_name in common_fields:
|
|
274
|
+
pxt_col_type = schema[field_name]
|
|
275
|
+
model_field = model_fields[field_name]
|
|
276
|
+
model_type = model_field.annotation
|
|
277
|
+
|
|
278
|
+
# we ignore nullability: we want to accept optional model fields for required table columns, as long as
|
|
279
|
+
# the model instances provide a non-null value
|
|
280
|
+
# infer_pydantic_json=True: model_dump(mode='json') converts enums to their values
|
|
281
|
+
inferred_pxt_type = ts.ColumnType.from_python_type(model_type, infer_pydantic_json=True)
|
|
282
|
+
if inferred_pxt_type is None:
|
|
283
|
+
raise excs.Error(
|
|
284
|
+
f'Pydantic model {model.__name__!r}: cannot infer Pixeltable type for column {field_name!r}'
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
if pxt_col_type.is_media_type():
|
|
288
|
+
# media types require file paths, either as str or Path
|
|
289
|
+
if not inferred_pxt_type.is_string_type():
|
|
290
|
+
raise excs.Error(
|
|
291
|
+
f"Column {field_name!r} requires a 'str' or 'Path' field in {model.__name__!r}, but it is "
|
|
292
|
+
f'{model_type.__name__!r}'
|
|
293
|
+
)
|
|
294
|
+
else:
|
|
295
|
+
if not pxt_col_type.is_supertype_of(inferred_pxt_type, ignore_nullable=True):
|
|
296
|
+
raise excs.Error(
|
|
297
|
+
f'Pydantic model {model.__name__!r} has incompatible type ({model_type.__name__}) '
|
|
298
|
+
f'for column {field_name!r} ({pxt_col_type})'
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
if (
|
|
302
|
+
isinstance(model_type, type)
|
|
303
|
+
and issubclass(model_type, pydantic.BaseModel)
|
|
304
|
+
and not is_json_convertible(model_type)
|
|
305
|
+
):
|
|
306
|
+
raise excs.Error(
|
|
307
|
+
f'Pydantic model {model.__name__!r} has field {field_name!r} with nested model '
|
|
308
|
+
f'{model_type.__name__!r}, which is not JSON-convertible'
|
|
309
|
+
)
|
|
213
310
|
|
|
214
311
|
def delete(self, where: Optional['exprs.Expr'] = None) -> UpdateStatus:
|
|
215
312
|
"""Delete rows in this table.
|
pixeltable/catalog/table.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
4
|
import builtins
|
|
5
|
+
import datetime
|
|
5
6
|
import json
|
|
6
7
|
import logging
|
|
7
8
|
from keyword import iskeyword as is_python_keyword
|
|
@@ -9,7 +10,6 @@ from pathlib import Path
|
|
|
9
10
|
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Optional, TypedDict, overload
|
|
10
11
|
|
|
11
12
|
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
12
|
-
import datetime
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
15
|
import pandas as pd
|
|
@@ -1356,6 +1356,15 @@ class Table(SchemaObject):
|
|
|
1356
1356
|
Insert rows from a CSV file:
|
|
1357
1357
|
|
|
1358
1358
|
>>> tbl.insert(source='path/to/file.csv')
|
|
1359
|
+
|
|
1360
|
+
Insert Pydantic model instances into a table with two `pxt.Int` columns `a` and `b`:
|
|
1361
|
+
|
|
1362
|
+
>>> class MyModel(pydantic.BaseModel):
|
|
1363
|
+
... a: int
|
|
1364
|
+
... b: int
|
|
1365
|
+
...
|
|
1366
|
+
>>> models = [MyModel(a=1, b=2), MyModel(a=3, b=4)]
|
|
1367
|
+
>>> tbl.insert(models)
|
|
1359
1368
|
"""
|
|
1360
1369
|
raise NotImplementedError
|
|
1361
1370
|
|
pixeltable/config.py
CHANGED
|
@@ -167,6 +167,7 @@ KNOWN_CONFIG_OPTIONS = {
|
|
|
167
167
|
'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
|
|
168
168
|
'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
|
|
169
169
|
'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
|
|
170
|
+
'hf': {'auth_token': 'Hugging Face access token'},
|
|
170
171
|
'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
|
|
171
172
|
'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
|
|
172
173
|
'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
|
pixeltable/env.py
CHANGED
|
@@ -11,6 +11,7 @@ import logging
|
|
|
11
11
|
import os
|
|
12
12
|
import platform
|
|
13
13
|
import shutil
|
|
14
|
+
import subprocess
|
|
14
15
|
import sys
|
|
15
16
|
import threading
|
|
16
17
|
import types
|
|
@@ -27,6 +28,7 @@ import nest_asyncio # type: ignore[import-untyped]
|
|
|
27
28
|
import pixeltable_pgserver
|
|
28
29
|
import sqlalchemy as sql
|
|
29
30
|
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
31
|
+
from tenacity import retry, stop_after_attempt, wait_exponential_jitter
|
|
30
32
|
from tqdm import TqdmWarning
|
|
31
33
|
|
|
32
34
|
from pixeltable import exceptions as excs
|
|
@@ -81,6 +83,7 @@ class Env:
|
|
|
81
83
|
_file_cache_size_g: float
|
|
82
84
|
_pxt_api_key: Optional[str]
|
|
83
85
|
_stdout_handler: logging.StreamHandler
|
|
86
|
+
_default_video_encoder: str | None
|
|
84
87
|
_initialized: bool
|
|
85
88
|
|
|
86
89
|
_resource_pool_info: dict[str, Any]
|
|
@@ -104,10 +107,14 @@ class Env:
|
|
|
104
107
|
cls._instance._clean_up()
|
|
105
108
|
cls._instance = None
|
|
106
109
|
env = Env()
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
110
|
+
try:
|
|
111
|
+
env._set_up(reinit_db=reinit_db)
|
|
112
|
+
env._upgrade_metadata()
|
|
113
|
+
cls._instance = env
|
|
114
|
+
finally:
|
|
115
|
+
# Reset the initializing flag, even if setup fails.
|
|
116
|
+
# This prevents the environment from being left in a broken state.
|
|
117
|
+
cls.__initializing = False
|
|
111
118
|
|
|
112
119
|
def __init__(self) -> None:
|
|
113
120
|
assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
|
|
@@ -127,6 +134,7 @@ class Env:
|
|
|
127
134
|
self._spacy_nlp = None
|
|
128
135
|
self._httpd = None
|
|
129
136
|
self._http_address = None
|
|
137
|
+
self._default_video_encoder = None
|
|
130
138
|
|
|
131
139
|
# logging-related state
|
|
132
140
|
self._logger = logging.getLogger('pixeltable')
|
|
@@ -500,14 +508,24 @@ class Env:
|
|
|
500
508
|
assert self._db_url is not None
|
|
501
509
|
assert self._db_name is not None
|
|
502
510
|
|
|
511
|
+
@retry(
|
|
512
|
+
stop=stop_after_attempt(3), # Stop after 3 attempts
|
|
513
|
+
wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2), # Exponential backoff with jitter
|
|
514
|
+
)
|
|
503
515
|
def _init_metadata(self) -> None:
|
|
504
516
|
"""
|
|
505
517
|
Create pixeltable metadata tables and system metadata.
|
|
506
518
|
This is an idempotent operation.
|
|
519
|
+
|
|
520
|
+
Retry logic handles race conditions when multiple Pixeltable processes
|
|
521
|
+
attempt to initialize metadata tables simultaneously. The first process may succeed
|
|
522
|
+
in creating tables while others encounter database constraints (e.g., "table already exists").
|
|
523
|
+
Exponential backoff with jitter reduces contention between competing processes.
|
|
507
524
|
"""
|
|
508
525
|
assert self._sa_engine is not None
|
|
509
526
|
from pixeltable import metadata
|
|
510
527
|
|
|
528
|
+
self._logger.debug('Creating pixeltable metadata')
|
|
511
529
|
metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
|
|
512
530
|
metadata.create_system_info(self._sa_engine)
|
|
513
531
|
|
|
@@ -662,6 +680,41 @@ class Env:
|
|
|
662
680
|
self._start_web_server()
|
|
663
681
|
self.__register_packages()
|
|
664
682
|
|
|
683
|
+
@property
|
|
684
|
+
def default_video_encoder(self) -> str | None:
|
|
685
|
+
if self._default_video_encoder is None:
|
|
686
|
+
self._default_video_encoder = self._determine_default_video_encoder()
|
|
687
|
+
return self._default_video_encoder
|
|
688
|
+
|
|
689
|
+
def _determine_default_video_encoder(self) -> str | None:
|
|
690
|
+
"""
|
|
691
|
+
Returns the first available encoder from a list of candidates.
|
|
692
|
+
|
|
693
|
+
TODO:
|
|
694
|
+
- the user might prefer a hardware-accelerated encoder (eg, h264_nvenc or h264_videotoolbox)
|
|
695
|
+
- allow user override via a config option 'video_encoder'
|
|
696
|
+
"""
|
|
697
|
+
# look for available encoders, in this order
|
|
698
|
+
candidates = [
|
|
699
|
+
'libx264', # GPL, best quality
|
|
700
|
+
'libopenh264', # BSD
|
|
701
|
+
]
|
|
702
|
+
|
|
703
|
+
try:
|
|
704
|
+
# Get list of available encoders
|
|
705
|
+
result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10, check=True)
|
|
706
|
+
|
|
707
|
+
if result.returncode == 0:
|
|
708
|
+
available_encoders = result.stdout
|
|
709
|
+
for encoder in candidates:
|
|
710
|
+
# ffmpeg -encoders output format: " V..... encoder_name description"
|
|
711
|
+
if f' {encoder} ' in available_encoders:
|
|
712
|
+
_logger.debug(f'Using H.264 encoder: {encoder}')
|
|
713
|
+
return encoder
|
|
714
|
+
except Exception:
|
|
715
|
+
pass
|
|
716
|
+
return None
|
|
717
|
+
|
|
665
718
|
def __register_packages(self) -> None:
|
|
666
719
|
"""Declare optional packages that are utilized by some parts of the code."""
|
|
667
720
|
self.__register_package('anthropic')
|
pixeltable/functions/__init__.py
CHANGED
pixeltable/functions/audio.py
CHANGED
|
@@ -3,6 +3,7 @@ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
|
+
import pixeltable.utils.av as av_utils
|
|
6
7
|
from pixeltable.utils.code import local_public_names
|
|
7
8
|
|
|
8
9
|
|
|
@@ -47,7 +48,7 @@ def get_metadata(audio: pxt.Audio) -> dict:
|
|
|
47
48
|
|
|
48
49
|
>>> tbl.select(tbl.audio_col.get_metadata()).collect()
|
|
49
50
|
"""
|
|
50
|
-
return
|
|
51
|
+
return av_utils.get_metadata(audio)
|
|
51
52
|
|
|
52
53
|
|
|
53
54
|
__all__ = local_public_names(__name__)
|
pixeltable/functions/gemini.py
CHANGED
|
@@ -14,6 +14,7 @@ import PIL.Image
|
|
|
14
14
|
|
|
15
15
|
import pixeltable as pxt
|
|
16
16
|
from pixeltable import env, exceptions as excs, exprs
|
|
17
|
+
from pixeltable.utils.code import local_public_names
|
|
17
18
|
from pixeltable.utils.media_store import TempStore
|
|
18
19
|
|
|
19
20
|
if TYPE_CHECKING:
|
|
@@ -232,3 +233,10 @@ async def generate_videos(
|
|
|
232
233
|
@generate_videos.resource_pool
|
|
233
234
|
def _(model: str) -> str:
|
|
234
235
|
return f'request-rate:veo:{model}'
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
__all__ = local_public_names(__name__)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def __dir__() -> list[str]:
|
|
242
|
+
return __all__
|