pixeltable 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -1
- pixeltable/catalog/catalog.py +4 -6
- pixeltable/catalog/insertable_table.py +125 -28
- pixeltable/catalog/table.py +51 -15
- pixeltable/catalog/table_version.py +12 -8
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/config.py +25 -9
- pixeltable/dataframe.py +3 -3
- pixeltable/env.py +89 -20
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +16 -4
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/data_row.py +5 -5
- pixeltable/exprs/function_call.py +59 -21
- pixeltable/exprs/row_builder.py +11 -5
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +1 -2
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/openai.py +2 -1
- pixeltable/functions/video.py +5 -5
- pixeltable/functions/whisperx.py +177 -0
- pixeltable/{ext/functions → functions}/yolox.py +0 -4
- pixeltable/globals.py +16 -3
- pixeltable/io/fiftyone.py +3 -3
- pixeltable/io/label_studio.py +2 -1
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +0 -6
- pixeltable/metadata/__init__.py +3 -1
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +6 -6
- pixeltable/share/publish.py +134 -7
- pixeltable/type_system.py +20 -4
- pixeltable/utils/media_store.py +131 -66
- pixeltable/utils/pydantic.py +60 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/METADATA +186 -121
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/RECORD +47 -46
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/licenses/LICENSE +0 -0
pixeltable/env.py
CHANGED
@@ -15,7 +15,6 @@ import sys
 import threading
 import types
 import typing
-import uuid
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field

@@ -28,6 +27,7 @@ import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning

 from pixeltable import exceptions as excs
@@ -101,12 +101,18 @@ class Env:
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        if cls._instance is not None:
+            cls._instance._clean_up()
         cls._instance = None
         env = Env()
-        env._set_up(reinit_db=reinit_db)
-        env._upgrade_metadata()
-        cls._instance = env
-        cls.__initializing = False
+        try:
+            env._set_up(reinit_db=reinit_db)
+            env._upgrade_metadata()
+            cls._instance = env
+        finally:
+            # Reset the initializing flag, even if setup fails.
+            # This prevents the environment from being left in a broken state.
+            cls.__initializing = False

     def __init__(self) -> None:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
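The reworked `_init_env` tears down any existing singleton before building a new one, and clears the guard flag even when setup fails. A toy sketch of the same pattern (class and method names invented):

    class Singleton:
        _instance = None
        _initializing = False

        @classmethod
        def init(cls) -> 'Singleton':
            assert not cls._initializing, 'circular initialization'
            cls._initializing = True
            if cls._instance is not None:
                cls._instance.close()  # release engines, servers, event loops, ...
            cls._instance = None
            obj = cls()
            try:
                obj.set_up()
                cls._instance = obj  # only publish a fully set-up instance
            finally:
                cls._initializing = False  # never leave the guard flag stuck on failure
            return obj

        def set_up(self) -> None: ...
        def close(self) -> None: ...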
@@ -246,7 +252,7 @@ class Env:
         if self._current_conn is None:
             assert self._current_session is None
             try:
-                self._current_isolation_level = 'SERIALIZABLE'
+                self._current_isolation_level = 'SERIALIZABLE'
                 with (
                     self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                     sql.orm.Session(conn) as session,

@@ -485,7 +491,7 @@ class Env:
                 raise excs.Error(error)
             self._logger.info(f'Using database at: {self.db_url}')
         else:
-            self._db_name =
+            self._db_name = config.get_string_value('db') or 'pixeltable'
             self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
             # cleanup_mode=None will leave the postgres process running after Python exits
             # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -499,14 +505,24 @@ class Env:
         assert self._db_url is not None
         assert self._db_name is not None

+    @retry(
+        stop=stop_after_attempt(3),  # Stop after 3 attempts
+        wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),  # Exponential backoff with jitter
+    )
     def _init_metadata(self) -> None:
         """
         Create pixeltable metadata tables and system metadata.
         This is an idempotent operation.
+
+        Retry logic handles race conditions when multiple Pixeltable processes
+        attempt to initialize metadata tables simultaneously. The first process may succeed
+        in creating tables while others encounter database constraints (e.g., "table already exists").
+        Exponential backoff with jitter reduces contention between competing processes.
         """
         assert self._sa_engine is not None
         from pixeltable import metadata

+        self._logger.debug('Creating pixeltable metadata')
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)

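tenacity's `retry` decorator drives the backoff described in the docstring above. A minimal standalone sketch of the same pattern, using sqlite3 and an invented table in place of the real Postgres metadata:

    import sqlite3

    from tenacity import retry, stop_after_attempt, wait_exponential_jitter

    @retry(
        stop=stop_after_attempt(3),  # give up after the third attempt
        wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),  # 0.2s, 0.4s, ... capped at 1s, plus jitter
    )
    def init_metadata(db_path: str) -> None:
        # Idempotent: IF NOT EXISTS tolerates a table created by a competing process;
        # the retry absorbs the transient error raised when two processes race on the creation.
        with sqlite3.connect(db_path) as conn:
            conn.execute('CREATE TABLE IF NOT EXISTS system_info (key TEXT PRIMARY KEY, value TEXT)')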
@@ -557,6 +573,14 @@ class Env:
         finally:
             engine.dispose()

+    def _pgserver_terminate_connections_stmt(self) -> str:
+        return f"""
+            SELECT pg_terminate_backend(pg_stat_activity.pid)
+            FROM pg_stat_activity
+            WHERE pg_stat_activity.datname = '{self._db_name}'
+            AND pid <> pg_backend_pid()
+        """
+
     def _drop_store_db(self) -> None:
         assert self._db_name is not None
         engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
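Factoring the `pg_terminate_backend` query into a helper lets the drop-database path below and the new `_clean_up` path share it. A hypothetical standalone equivalent (function name and parameters invented):

    import sqlalchemy as sql

    def terminate_connections(system_db_url: str, db_name: str) -> None:
        # Connect to the system database in AUTOCOMMIT mode and kill every other session
        # attached to db_name, so a subsequent DROP DATABASE cannot be blocked.
        engine = sql.create_engine(system_db_url, isolation_level='AUTOCOMMIT')
        stmt = f"""
            SELECT pg_terminate_backend(pg_stat_activity.pid)
            FROM pg_stat_activity
            WHERE pg_stat_activity.datname = '{db_name}'
            AND pid <> pg_backend_pid()
        """
        try:
            with engine.begin() as conn:
                conn.execute(sql.text(stmt))
        finally:
            engine.dispose()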
@@ -565,13 +589,7 @@ class Env:
         with engine.begin() as conn:
             # terminate active connections
             if self._db_server is not None:
-                stmt = f"""
-                    SELECT pg_terminate_backend(pg_stat_activity.pid)
-                    FROM pg_stat_activity
-                    WHERE pg_stat_activity.datname = '{self._db_name}'
-                    AND pid <> pg_backend_pid()
-                """
-                conn.execute(sql.text(stmt))
+                conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
             # drop db
             stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
             conn.execute(sql.text(stmt))

@@ -749,12 +767,6 @@ class Env:
         else:
             os.remove(path)

-    def num_tmp_files(self) -> int:
-        return len(glob.glob(f'{self._tmp_dir}/*'))
-
-    def create_tmp_path(self, extension: str = '') -> Path:
-        return self._tmp_dir / f'{uuid.uuid4()}{extension}'
-
     # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
     def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
         """Returns the info object for the given id, creating it if necessary."""
@@ -815,6 +827,63 @@ class Env:
         except Exception as exc:
             raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc

+    def _clean_up(self) -> None:
+        """
+        Internal cleanup method that properly closes all resources and resets state.
+        This is called before destroying the singleton instance.
+        """
+        assert self._current_session is None
+        assert self._current_conn is None
+
+        # Stop HTTP server
+        if self._httpd is not None:
+            try:
+                self._httpd.shutdown()
+                self._httpd.server_close()
+            except Exception as e:
+                _logger.warning(f'Error stopping HTTP server: {e}')
+
+        # First terminate all connections to the database
+        if self._db_server is not None:
+            assert self._dbms is not None
+            assert self._db_name is not None
+            try:
+                temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
+                try:
+                    with temp_engine.begin() as conn:
+                        conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
+                    _logger.info(f"Terminated all connections to database '{self._db_name}'")
+                except Exception as e:
+                    _logger.warning(f'Error terminating database connections: {e}')
+                finally:
+                    temp_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error stopping database server: {e}')
+
+        # Dispose of SQLAlchemy engine (after stopping db server)
+        if self._sa_engine is not None:
+            try:
+                self._sa_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error disposing engine: {e}')
+
+        # Close event loop
+        if self._event_loop is not None:
+            try:
+                if self._event_loop.is_running():
+                    self._event_loop.stop()
+                self._event_loop.close()
+            except Exception as e:
+                _logger.warning(f'Error closing event loop: {e}')
+
+        # Remove logging handlers
+        for handler in self._logger.handlers[:]:
+            try:
+                handler.close()
+                self._logger.removeHandler(handler)
+            except Exception as e:
+                _logger.warning(f'Error removing handler: {e}')
+

 def register_client(name: str) -> Callable:
     """Decorator that registers a third-party API client for use by Pixeltable.
pixeltable/exec/aggregation_node.py
CHANGED

@@ -103,6 +103,6 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
+        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py
CHANGED

@@ -12,8 +12,9 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID

-from pixeltable import
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -219,7 +220,7 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_requests[f] = url

     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into
+        """Fetches a remote URL into the TempStore and returns its path"""
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed

@@ -230,7 +231,7 @@ class CachePrefetchNode(ExecNode):
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path =
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
             if parsed.scheme == 's3':
pixeltable/exec/exec_node.py
CHANGED
@@ -20,7 +20,6 @@ class ExecNode(abc.ABC):
     row_builder: exprs.RowBuilder
     input: Optional[ExecNode]
     flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-    stored_img_cols: list[exprs.ColumnSlotIdx]
     ctx: Optional[ExecContext]

     def __init__(

@@ -40,7 +39,6 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.stored_img_cols = []
         self.ctx = None  # all nodes of a tree share the same context

     def set_ctx(self, ctx: ExecContext) -> None:

@@ -48,12 +46,6 @@ class ExecNode(abc.ABC):
         if self.input is not None:
             self.input.set_ctx(ctx)

-    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-        self.stored_img_cols = stored_img_cols
-        # propagate batch size to the source
-        if self.input is not None:
-            self.input.set_stored_img_cols(stored_img_cols)
-
     @abc.abstractmethod
     def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         pass
pixeltable/exec/expr_eval/schedulers.py
CHANGED

@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
         exec_ctx: ExecCtx
+        retry_after: Optional[float] = None  # time.monotonic()

         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)

@@ -270,6 +270,7 @@ class RequestRateScheduler(Scheduler):
     num_in_flight: int
     total_requests: int
     total_retried: int
+    total_errors: int

     TIME_FORMAT = '%H:%M.%S %f'
     MAX_RETRIES = 3

@@ -294,6 +295,7 @@ class RequestRateScheduler(Scheduler):
         self.num_in_flight = 0
         self.total_requests = 0
         self.total_retried = 0
+        self.total_errors = 0

         # try to get the rate limit from the config
         elems = resource_pool.split(':')

@@ -312,6 +314,7 @@ class RequestRateScheduler(Scheduler):
             key = model
         requests_per_min = Config.get().get_int_value(key, section=section)
         requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+        _logger.debug(f'rate limit for {self.resource_pool}: {requests_per_min} RPM')
         self.secs_per_request = 1 / (requests_per_min / 60)

     @classmethod

@@ -325,8 +328,12 @@ class RequestRateScheduler(Scheduler):
             if item.num_retries > 0:
                 self.total_retried += 1
             now = time.monotonic()
+            wait_duration = 0.0
+            if item.retry_after is not None:
+                wait_duration = item.retry_after - now
             if now - last_request_ts < self.secs_per_request:
-                wait_duration = self.secs_per_request - (now - last_request_ts)
+                wait_duration = max(wait_duration, self.secs_per_request - (now - last_request_ts))
+            if wait_duration > 0:
                 _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
                 await asyncio.sleep(wait_duration)

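The effective wait is now the larger of the request-rate spacing and any server-imposed retry deadline. A worked toy example (all values invented):

    import time

    secs_per_request = 60 / 600            # 600 RPM -> 0.1s between requests
    now = time.monotonic()
    last_request_ts = now - 0.03           # last request went out 30ms ago
    retry_after = now + 0.5                # server asked us to back off for 500ms

    wait_duration = 0.0
    if retry_after is not None:
        wait_duration = retry_after - now
    if now - last_request_ts < secs_per_request:
        wait_duration = max(wait_duration, secs_per_request - (now - last_request_ts))
    # wait_duration is ~0.5: the retry deadline dominates the 70ms rate-limit gap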
@@ -372,15 +379,20 @@ class RequestRateScheduler(Scheduler):

         except Exception as exc:
             _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+            if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+                _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
             is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
             if is_rate_limit_error and num_retries < self.MAX_RETRIES:
                 retry_delay = self._compute_retry_delay(num_retries, retry_after)
                 _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
-
-
+                now = time.monotonic()
+                # put the request back in the queue right away, which prevents new requests from being generated until
+                # this one succeeds or exceeds its retry limit
+                self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx, retry_after=now + retry_delay))
                 return

             # record the exception
+            self.total_errors += 1
             _, _, exc_tb = sys.exc_info()
             for row in request.rows:
                 row.set_exc(request.fn_call.slot_idx, exc)

@@ -388,7 +400,7 @@ class RequestRateScheduler(Scheduler):
         finally:
             _logger.debug(
                 f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
-                f'#retried={self.total_retried}'
+                f'#retried={self.total_retried} #errors={self.total_errors}'
             )
             if is_task:
                 self.num_in_flight -= 1
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional

 from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -67,8 +67,7 @@ class InMemoryDataNode(ExecNode):
             col = col_info.col
             if col.col_type.is_image_type() and isinstance(val, bytes):
                 # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
-
-                filepath, _ = MediaStore.save_media_object(val, col, format=None)
+                filepath, _ = TempStore.save_media_object(val, col, format=None)
                 output_row[col_info.slot_idx] = str(filepath)
             else:
                 output_row[col_info.slot_idx] = val
pixeltable/exprs/data_row.py
CHANGED
@@ -14,7 +14,7 @@ import PIL.Image
 import sqlalchemy as sql

 from pixeltable import catalog, env
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.media_store import MediaStore, TempStore


 class DataRow:

@@ -270,7 +270,7 @@ class DataRow:
             # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
             # In that case, use WebP instead.
             format = 'webp' if image.has_transparency_data else 'jpeg'
-            filepath, url = MediaStore.save_media_object(image, col, format=format)
+            filepath, url = MediaStore.get().save_media_object(image, col, format=format)
             self.file_paths[index] = str(filepath)
             self.file_urls[index] = url
         else:

@@ -282,16 +282,16 @@ class DataRow:
             self.vals[index] = None

     def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
-        """If a media url refers to data in a temporary file, move the data to
+        """If a media url refers to data in a temporary file, move the data to a MediaStore"""
         if self.file_urls[index] is None:
             return
         assert self.excs[index] is None
         assert col.col_type.is_media_type()
-        src_path =
+        src_path = TempStore.resolve_url(self.file_urls[index])
         if src_path is None:
             # The media url does not point to a temporary file, leave it as is
             return
-        new_file_url = MediaStore.relocate_local_media_file(src_path, col)
+        new_file_url = MediaStore.get().relocate_local_media_file(src_path, col)
         self.file_urls[index] = new_file_url

     @property
pixeltable/exprs/function_call.py
CHANGED

@@ -115,6 +115,7 @@ class FunctionCall(Expr):
         self._validation_error = validation_error

         if validation_error is not None:
+            self.bound_idxs = {}
             self.resource_pool = None
             return

@@ -300,8 +301,16 @@ class FunctionCall(Expr):
         """
         res = super().substitute(spec)
         assert res is self
-        self.return_type = self.fn.call_return_type(self.bound_args)
-        self.col_type = self.return_type
+        if self.is_valid:
+            # If this FunctionCall is valid, re-evaluate the call_return_type of the substituted expression. If the
+            # FunctionCall is not valid, it isn't safe to do this. (Really we should be asserting that it *is* valid,
+            # but we still need to be able to do substitutions on invalid FunctionCalls, because loading an
+            # EmbeddingIndex from the db involves reconstructing the requisite (substituted) FunctionCalls. We could
+            # fix this by separately persisting the FunctionCall instances held by EmbeddingIndex to the db. That's
+            # probably a good idea, but it's also probably not urgent, since it only affects Functions that have a
+            # conditional_return_type implemented.)
+            self.return_type = self.fn.call_return_type(self.bound_args)
+            self.col_type = self.return_type
         return self

     def update(self, data_row: DataRow) -> None:

@@ -480,25 +489,54 @@ class FunctionCall(Expr):
                 ).strip()
         else:
             # Evaluate the call_return_type as defined in the current codebase.
-            call_return_type =
-
-
-            #
-            #
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
+            call_return_type: Optional[ts.ColumnType] = None
+
+            if isinstance(resolved_fn, func.ExprTemplateFunction) and not resolved_fn.template.expr.is_valid:
+                # The FunctionCall is based on an ExprTemplateFunction, but the template expression is not valid
+                # (because it in turn contains an invalid FunctionCall). In this case, inherit the validation error
+                # from the template expression.
+                validation_error = resolved_fn.template.expr.validation_error
+            else:
+                try:
+                    call_return_type = resolved_fn.call_return_type(bound_args)
+                except ImportError as exc:
+                    validation_error = dedent(
+                        f"""
+                        A UDF call to {fn.self_path!r} could not be fully resolved, because a module required
+                        by the UDF could not be imported:
+                        {exc}
+                        """
+                    )
+
+            assert (call_return_type is None) != (validation_error is None)
+
+            if call_return_type is None and return_type is None:
+                # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious
+                # way to infer it during DB migration, so we might encounter a stored return_type of None. If the
+                # resolution of call_return_type also fails, then we're out of luck; we have no choice but to
+                # fail-fast.
+                raise excs.Error(validation_error)
+
+            if call_return_type is not None:
+                # call_return_type resolution succeeded.
+                if return_type is None:
+                    # Schema versions prior to 25 did not store the return_type in metadata (as mentioned above), so
+                    # fall back on the call_return_type.
+                    return_type = call_return_type
+                elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
+                    # There is a return_type stored in metadata (schema version >= 25),
+                    # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
+                    validation_error = dedent(
+                        f"""
+                        The return type stored in the database for a UDF call to {fn.self_path!r} no longer
+                        matches its return type as currently defined in the code. This probably means that the
+                        code for {fn.self_path!r} has changed in a backward-incompatible way.
+                        Return type of UDF call in the database: {return_type}
+                        Return type of UDF as currently defined in code: {call_return_type}
+                        """
+                    ).strip()
+
+        assert return_type is not None  # Guaranteed by the above logic.

         fn_call = cls(
             resolved_fn,
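The `is_supertype_of(..., ignore_nullable=True)` check accepts a stored return type that differs from the current one only in nullability. A toy illustration (assuming the `nullable` constructor flag on `pixeltable.type_system.FloatType`):

    import pixeltable.type_system as ts

    # A stored Optional[Float] return type still accepts a UDF that now returns a
    # non-nullable Float: with ignore_nullable=True, only the underlying type matters.
    stored = ts.FloatType(nullable=True)
    current = ts.FloatType(nullable=False)
    assert stored.is_supertype_of(current, ignore_nullable=True)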
pixeltable/exprs/row_builder.py
CHANGED
@@ -86,6 +86,8 @@ class RowBuilder:
     img_slot_idxs: list[int]  # Indices of image slots
     media_slot_idxs: list[int]  # Indices of non-image media slots
     array_slot_idxs: list[int]  # Indices of array slots
+    stored_img_cols: list[exprs.ColumnSlotIdx]
+    stored_media_cols: list[exprs.ColumnSlotIdx]

     @dataclass
     class EvalCtx:

@@ -112,6 +114,8 @@ class RowBuilder:
         """
         self.unique_exprs: ExprSet[Expr] = ExprSet()  # dependencies precede their dependents
         self.next_slot_idx = 0
+        self.stored_img_cols = []
+        self.stored_media_cols = []

         # record input and output exprs; make copies to avoid reusing execution state
         unique_input_exprs = [self._record_unique_expr(e.copy(), recursive=False) for e in input_exprs]

@@ -246,11 +250,13 @@ class RowBuilder:
     def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
         """Record a column that is part of the table row"""
         assert self.tbl is not None
-
-
-
-
-
+        assert col.is_stored
+        info = ColumnSlotIdx(col, slot_idx)
+        self.table_columns.append(info)
+        if col.col_type.is_media_type():
+            self.stored_media_cols.append(info)
+        if col.col_type.is_image_type():
+            self.stored_img_cols.append(info)

     @property
     def num_materialized(self) -> int:
pixeltable/func/expr_template_function.py
CHANGED

@@ -85,13 +85,16 @@ class ExprTemplateFunction(Function):
         conditional_return_type).
         """
         assert not self.is_polymorphic
-        template = self.template
         with_defaults = bound_args.copy()
         with_defaults.update(
-            {
+            {
+                param_name: default
+                for param_name, default in self.template.defaults.items()
+                if param_name not in bound_args
+            }
         )
         substituted_expr = self.template.expr.copy().substitute(
-            {template.param_exprs[name]: expr for name, expr in with_defaults.items()}
+            {self.template.param_exprs[name]: expr for name, expr in with_defaults.items()}
         )
         return substituted_expr.col_type

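The rewritten comprehension only fills in defaults for parameters the caller did not bind, so a template default can no longer clobber an explicitly bound argument. A toy illustration with invented parameter names:

    # Caller-supplied arguments take precedence over template defaults.
    bound_args = {'temperature': 0.2}
    defaults = {'temperature': 0.7, 'max_tokens': 256}

    with_defaults = bound_args.copy()
    with_defaults.update({name: dflt for name, dflt in defaults.items() if name not in bound_args})
    assert with_defaults == {'temperature': 0.2, 'max_tokens': 256}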
pixeltable/functions/__init__.py
CHANGED
pixeltable/functions/anthropic.py
CHANGED

@@ -132,8 +132,7 @@ class AnthropicRateLimitsInfo(env.RateLimitsInfo):
         should_retry_str = exc.response.headers.get('x-should-retry', '')
         if should_retry_str.lower() != 'true':
             return None
-
-        return int(retry_after_str)
+        return super().get_retry_delay(exc)


 @pxt.udf
pixeltable/functions/deepseek.py
CHANGED
@@ -26,7 +26,7 @@ def _deepseek_client() -> 'openai.AsyncOpenAI':
     return env.Env.get().get_client('deepseek')


-@pxt.udf
+@pxt.udf(resource_pool='request-rate:deepseek')
 async def chat_completions(
     messages: list,
     *,

@@ -43,6 +43,10 @@ async def chat_completions(

     Deepseek uses the OpenAI SDK, so you will need to install the `openai` package to use this UDF.

+    Request throttling:
+    Applies the rate limit set in the config (section `deepseek`, key `rate_limit`). If no rate
+    limit is configured, uses a default of 600 RPM.
+
     __Requirements:__

     - `pip install openai`
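A hypothetical usage sketch of the throttled UDF (table, column, and config values invented; assumes a DeepSeek API key is configured):

    import pixeltable as pxt
    from pixeltable.functions import deepseek

    # Optional: cap throughput in Pixeltable's config.toml (default is 600 RPM):
    #   [deepseek]
    #   rate_limit = 300
    t = pxt.create_table('chat_demo', {'prompt': pxt.String})
    t.add_computed_column(
        response=deepseek.chat_completions(
            messages=[{'role': 'user', 'content': t.prompt}],
            model='deepseek-chat',
        )
    )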
pixeltable/functions/gemini.py
CHANGED
@@ -14,6 +14,7 @@ import PIL.Image

 import pixeltable as pxt
 from pixeltable import env, exceptions as excs, exprs
+from pixeltable.utils.media_store import TempStore

 if TYPE_CHECKING:
     from google import genai

@@ -39,7 +40,7 @@ async def generate_content(
     <https://ai.google.dev/gemini-api/docs/text-generation>

     Request throttling:
-    Applies the rate limit set in the config (section `gemini
+    Applies the rate limit set in the config (section `gemini.rate_limits`; use the model id as the key). If no rate
     limit is configured, uses a default of 600 RPM.

     __Requirements:__

@@ -126,6 +127,10 @@ async def generate_images(prompt: str, *, model: str, config: Optional[dict] = None
     Generates images based on a text description and configuration. For additional details, see:
     <https://ai.google.dev/gemini-api/docs/image-generation>

+    Request throttling:
+    Applies the rate limit set in the config (section `imagen.rate_limits`; use the model id as the key). If no rate
+    limit is configured, uses a default of 600 RPM.
+
     __Requirements:__

     - `pip install google-genai`

@@ -167,6 +172,10 @@ async def generate_videos(
     Generates videos based on a text description and configuration. For additional details, see:
     <https://ai.google.dev/gemini-api/docs/video-generation>

+    Request throttling:
+    Applies the rate limit set in the config (section `veo.rate_limits`; use the model id as the key). If no rate
+    limit is configured, uses a default of 600 RPM.
+
     __Requirements:__

     - `pip install google-genai`

@@ -215,7 +224,7 @@ async def generate_videos(
     assert video_bytes is not None

     # Create a temporary file to store the video bytes
-    output_path =
+    output_path = TempStore.create_path(extension='.mp4')
     Path(output_path).write_bytes(video_bytes)
     return str(output_path)
