pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -11
- pixeltable/catalog/catalog.py +575 -220
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +15 -13
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/schema_object.py +9 -4
- pixeltable/catalog/table.py +96 -85
- pixeltable/catalog/table_version.py +257 -174
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +50 -56
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +19 -6
- pixeltable/env.py +50 -4
- pixeltable/exec/data_row_batch.py +3 -1
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exec/in_memory_data_node.py +6 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +7 -2
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/openai.py +1 -2
- pixeltable/functions/video.py +59 -16
- pixeltable/globals.py +109 -24
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +50 -1
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +40 -51
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +50 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0

pixeltable/env.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import datetime
 import glob
 import http.server
@@ -19,9 +20,10 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from sys import stdout
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
+import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
@@ -84,7 +86,9 @@ class Env:
     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
     _current_session: Optional[sql.orm.Session]
+    _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
+    _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
 
     @classmethod
     def get(cls) -> Env:
@@ -96,6 +100,7 @@
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        cls._instance = None
         env = Env()
         env._set_up(reinit_db=reinit_db)
         env._upgrade_metadata()
@@ -139,7 +144,34 @@
         self._resource_pool_info = {}
         self._current_conn = None
         self._current_session = None
+        self._current_isolation_level = None
         self._dbms = None
+        self._event_loop = None
+
+    def _init_event_loop(self) -> None:
+        try:
+            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+            # multiple run_until_complete()
+            running_loop = asyncio.get_running_loop()
+            self._event_loop = running_loop
+            _logger.debug('Patched running loop')
+        except RuntimeError:
+            self._event_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._event_loop)
+            # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
+            self._event_loop.slow_callback_duration = 3600
+
+        # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
+        # see run_coroutine_synchronously()
+        nest_asyncio.apply()
+        if _logger.isEnabledFor(logging.DEBUG):
+            self._event_loop.set_debug(True)
+
+    @property
+    def event_loop(self) -> asyncio.AbstractEventLoop:
+        if self._event_loop is None:
+            self._init_event_loop()
+        return self._event_loop
 
     @property
     def db_url(self) -> str:
@@ -201,20 +233,34 @@
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self) -> Iterator[sql.Connection]:
-        """
+    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+        """
+        Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
+
+        for_write: if True, uses serializable isolation; if False, uses repeatable_read
+
+        TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
+        that avoids tripping over any pending ops
+        """
         if self._current_conn is None:
             assert self._current_session is None
             try:
-
+                self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
+                with (
+                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
+                    sql.orm.Session(conn) as session,
+                    conn.begin(),
+                ):
                     self._current_conn = conn
                     self._current_session = session
                     yield conn
             finally:
                 self._current_session = None
                 self._current_conn = None
+                self._current_isolation_level = None
         else:
             assert self._current_session is not None
+            assert for_write == (self._current_isolation_level == 'serializable')
             yield self._current_conn
 
     def configure_logging(

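The new _init_event_loop/event_loop pair centralizes what ExecNode previously did inline (see the next file): reuse an already-running loop (e.g., Jupyter's) or create one, and apply nest_asyncio so run_until_complete() can nest. A minimal standalone sketch of the same pattern, assuming only that nest_asyncio is installed:

    import asyncio

    import nest_asyncio

    def get_or_create_loop() -> asyncio.AbstractEventLoop:
        try:
            loop = asyncio.get_running_loop()  # reuse e.g. Jupyter's loop
        except RuntimeError:
            loop = asyncio.new_event_loop()  # plain script: create and install one
            asyncio.set_event_loop(loop)
        nest_asyncio.apply()  # make run_until_complete() re-entrant
        return loop

    async def compute() -> int:
        return 42

    print(get_or_create_loop().run_until_complete(compute()))  # 42
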
pixeltable/exec/data_row_batch.py
CHANGED

@@ -90,7 +90,9 @@ class DataRowBatch:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-
+                col = info.col
+                assert col.tbl.id == self.tbl.id
+                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)

pixeltable/exec/exec_node.py
CHANGED

@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import abc
-import asyncio
 import logging
 from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
 
 from pixeltable import exprs
+from pixeltable.env import Env
 
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
         pass
 
     def __iter__(self) -> Iterator[DataRowBatch]:
-
-        loop: asyncio.AbstractEventLoop
-        try:
-            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
-            # multiple run_until_complete()
-            running_loop = asyncio.get_running_loop()
-            import nest_asyncio  # type: ignore[import-untyped]
-
-            nest_asyncio.apply()
-            loop = running_loop
-            _logger.debug('Patched running loop')
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
-            loop.slow_callback_duration = 3600
-
-        if _logger.isEnabledFor(logging.DEBUG):
-            loop.set_debug(True)
-
+        loop = Env.get().event_loop
         aiter = self.__aiter__()
         try:
             while True:
@@ -86,9 +67,11 @@
                 yield batch
         except StopAsyncIteration:
             pass
-
-
-
+        # TODO:
+        # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
+        #   we end up here
+        # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
+        #   creates tasks on its own
 
     def open(self) -> None:
         """Bottom-up initialization of nodes for execution. Must be called before __next__."""

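__iter__ now simply drives the shared loop, pulling one batch at a time out of the async iterator. A toy sketch of that sync-over-async bridge, with a stub async generator standing in for __aiter__():

    import asyncio
    from typing import Iterator

    async def batches():  # stand-in for ExecNode.__aiter__()
        for i in range(3):
            yield f'batch-{i}'

    def iterate(loop: asyncio.AbstractEventLoop) -> Iterator[str]:
        aiter = batches().__aiter__()
        try:
            while True:
                # synchronously pull the next item out of the async iterator
                yield loop.run_until_complete(aiter.__anext__())
        except StopAsyncIteration:
            pass

    loop = asyncio.new_event_loop()  # Pixeltable uses Env.get().event_loop here
    print(list(iterate(loop)))  # ['batch-0', 'batch-1', 'batch-2']
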
pixeltable/exec/expr_eval/schedulers.py
CHANGED

@@ -4,9 +4,10 @@ import asyncio
 import datetime
 import inspect
 import logging
+import re
 import sys
 import time
-from typing import Awaitable, Collection, Optional
+from typing import Any, Awaitable, Collection, Optional
 
 from pixeltable import env, func
 from pixeltable.config import Config
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
     total_retried: int
 
     TIME_FORMAT = '%H:%M.%S %f'
-    MAX_RETRIES =
+    MAX_RETRIES = 3
     DEFAULT_RATE_LIMIT = 600  # requests per minute
+    RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
+    RETRY_AFTER_PATTERNS = (
+        r'retry after (\d+(?:\.\d+)?)\s*seconds?',
+        r'try again in (\d+(?:\.\d+)?)\s*seconds?',
+        r'wait (\d+(?:\.\d+)?)\s*seconds?',
+        r'retry-after:\s*(\d+(?:\.\d+)?)',
+    )
+
+    # Exponential backoff defaults
+    BASE_RETRY_DELAY = 1.0  # in seconds
+    MAX_RETRY_DELAY = 60.0  # in seconds
+    RETRY_BACKOFF_MULTIPLIER = 2.0
 
     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
         super().__init__(resource_pool, dispatcher)
@@ -337,11 +350,12 @@
                 self.dispatcher.dispatch(request.rows, exec_ctx)
 
         except Exception as exc:
-
-
-
-
-
+            _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+            is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
+            if is_rate_limit_error and num_retries < self.MAX_RETRIES:
+                retry_delay = self._compute_retry_delay(num_retries, retry_after)
+                _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
+                await asyncio.sleep(retry_delay)
                 self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
                 return
 
@@ -358,6 +372,119 @@
         if is_task:
             self.num_in_flight -= 1
 
+    def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
+        """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
+        from http import HTTPStatus
+
+        # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
+        # We look for attributes that contain status codes, instead of checking the type of the exception,
+        # in order to handle a wider variety of exception classes.
+        is_rate_limit_error = False
+        retry_delay: Optional[float] = None
+
+        # requests.HTTPError/httpx.HTTPStatusError
+        if (
+            hasattr(exc, 'response')
+            and hasattr(exc.response, 'status_code')
+            and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
+        ):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
+        elif (
+            # urllib.error.HTTPError
+            (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
+            # aiohttp.ClientResponseError
+            or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
+        ) and hasattr(exc, 'headers'):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.headers)
+
+        if is_rate_limit_error:
+            return True, retry_delay
+
+        # Check common rate limit keywords in exception message
+        error_msg = str(exc).lower()
+        if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
+            retry_delay = self._extract_retry_delay_from_message(error_msg)
+            return True, retry_delay
+
+        return False, None
+
+    def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
+        """Extract retry delay from HTTP headers."""
+        if headers is None:
+            return None
+
+        # convert headers to dict-like object for consistent access
+        header_dict: dict
+        if hasattr(headers, 'get'):
+            header_dict = headers
+        else:
+            # headers are a list of tuples or other format
+            try:
+                header_dict = dict(headers)
+            except (TypeError, ValueError):
+                return None
+        # normalize dict keys: lowercase and remove dashes
+        header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
+
+        # check Retry-After header
+        retry_after = header_dict.get('retryafter')
+        if retry_after is not None:
+            try:
+                return float(retry_after)
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset (Unix timestamp)
+        reset_time = header_dict.get('xratelimitreset')
+        if reset_time is not None:
+            try:
+                reset_timestamp = float(reset_time)
+                delay = max(0, reset_timestamp - time.time())
+                return delay
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset-After (seconds from now)
+        reset_after = header_dict.get('xratelimitresetafter')
+        if reset_after is not None:
+            try:
+                return float(reset_after)
+            except (ValueError, TypeError):
+                pass
+
+        return None
+
+    def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
+        msg_lower = msg.lower()
+        for pattern in self.RETRY_AFTER_PATTERNS:
+            match = re.search(pattern, msg_lower)
+            if match is not None:
+                try:
+                    return float(match.group(1))
+                except (ValueError, TypeError):
+                    continue
+        return None
+
+    def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
+        """
+        Calculate exponential backoff delay for rate limit errors.
+
+        Args:
+            retry_count: Number of retries attempted (0-based)
+            retry_after: Suggested delay from Retry-After header
+
+        Returns:
+            Delay in seconds
+        """
+        if retry_after is not None and retry_after > 0:
+            # Use server-suggested delay, but cap it at max_delay
+            return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+        else:
+            delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
+            return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+
 
 # all concrete Scheduler subclasses that implement matches()
 SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]

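With these constants, _compute_retry_delay is a standard capped exponential backoff: absent a server hint the delays are 1 s, 2 s, 4 s (so with MAX_RETRIES = 3 a request waits at most 7 s across retries), and a Retry-After hint is used directly, clamped to [1 s, 60 s]. A self-contained check of that arithmetic:

    from typing import Optional

    BASE_RETRY_DELAY, MAX_RETRY_DELAY, RETRY_BACKOFF_MULTIPLIER = 1.0, 60.0, 2.0

    def compute_retry_delay(num_retries: int, retry_after: Optional[float] = None) -> float:
        if retry_after is not None and retry_after > 0:
            return max(min(retry_after, MAX_RETRY_DELAY), BASE_RETRY_DELAY)  # clamp the server hint
        delay = BASE_RETRY_DELAY * RETRY_BACKOFF_MULTIPLIER ** num_retries
        return max(min(delay, MAX_RETRY_DELAY), BASE_RETRY_DELAY)

    print([compute_retry_delay(n) for n in range(3)])  # [1.0, 2.0, 4.0]
    print(compute_retry_delay(0, retry_after=90.0))    # 60.0 (capped)
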
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -63,13 +63,12 @@ class InMemoryDataNode(ExecNode):
             for col_name, val in input_row.items():
                 col_info = user_cols_by_name.get(col_name)
                 assert col_info is not None
-
-                if
-                    # this is a literal
-
-
-
-                    self.output_rows[row_idx][col_info.slot_idx] = path
+                col = col_info.col
+                if col.col_type.is_image_type() and isinstance(val, bytes):
+                    # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
+                    assert col.tbl.id == self.tbl.id
+                    path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
+                    self.output_rows[row_idx][col_info.slot_idx] = str(path)
                 else:
                     self.output_rows[row_idx][col_info.slot_idx] = val
 

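The new bytes branch goes through MediaStore.save_media_file; a generic sketch of the same idea, with a hypothetical helper writing to the system temp dir instead of Pixeltable's media store:

    import tempfile
    import uuid
    from pathlib import Path

    def save_media_bytes(val: bytes, suffix: str = '.png') -> str:
        # an image slot stores a path/URL, not raw bytes, so persist the
        # literal as a file and record its location
        path = Path(tempfile.gettempdir()) / f'{uuid.uuid4().hex}{suffix}'
        path.write_bytes(val)
        return str(path)

    print(save_media_bytes(b'\x89PNG\r\n\x1a\n'))  # e.g. /tmp/3f2a...png
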
pixeltable/exprs/column_property_ref.py
CHANGED

@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
     ERRORMSG = 1
     FILEURL = 2
     LOCALPATH = 3
+    CELLMD = 4  # JSON metadata for the cell, e.g. errortype, errormsg for media columns
 
     def __init__(self, col_ref: ColumnRef, prop: Property):
         super().__init__(ts.StringType(nullable=True))
@@ -51,8 +52,8 @@
     def __repr__(self) -> str:
         return f'{self._col_ref}.{self.prop.name.lower()}'
 
-    def
-        return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
+    def is_cellmd_prop(self) -> bool:
+        return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
 
     def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
         if not self._col_ref.col_handle.get().is_stored:
@@ -63,21 +64,27 @@
         if (
             col.col_type.is_media_type()
             and col.media_validation == catalog.MediaValidation.ON_READ
-            and self.
+            and self.is_cellmd_prop()
         ):
             return None
 
         if self.prop == self.Property.ERRORTYPE:
-
-            return col.sa_errortype_col
+            return col.sa_cellmd_col.op('->>')('errortype')
         if self.prop == self.Property.ERRORMSG:
-
-
+            return col.sa_cellmd_col.op('->>')('errormsg')
+        if self.prop == self.Property.CELLMD:
+            assert col.sa_cellmd_col is not None
+            return col.sa_cellmd_col
         if self.prop == self.Property.FILEURL:
             # the file url is stored as the column value
             return sql_elements.get(self._col_ref)
         return None
 
+    @classmethod
+    def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
+        """Create a cellmd value from an exception."""
+        return {'errortype': type(exc).__name__, 'errormsg': str(exc)}
+
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
         if self.prop == self.Property.FILEURL:
             assert data_row.has_val[self._col_ref.slot_idx]
@@ -87,14 +94,19 @@
             assert data_row.has_val[self._col_ref.slot_idx]
             data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
             return
-        elif self.
+        elif self.is_cellmd_prop():
             exc = data_row.get_exc(self._col_ref.slot_idx)
             if exc is None:
                 data_row[self.slot_idx] = None
             elif self.prop == self.Property.ERRORTYPE:
                 data_row[self.slot_idx] = type(exc).__name__
-
+            elif self.prop == self.Property.ERRORMSG:
                 data_row[self.slot_idx] = str(exc)
+            elif self.prop == self.Property.CELLMD:
+                data_row[self.slot_idx] = self.create_cellmd_exc(exc)
+            else:
+                raise AssertionError(f'Unknown property {self.prop}')
+            return
         else:
             raise AssertionError()
 

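The op('->>') calls build the Postgres JSON operator that extracts a field as text, which is how errortype/errormsg are now read out of the consolidated cellmd column. A generic SQLAlchemy snippet (toy table, not Pixeltable's schema) showing what such an expression compiles to:

    import sqlalchemy as sql

    t = sql.table('t', sql.column('cellmd'))
    errortype = t.c.cellmd.op('->>')('errortype')
    print(errortype)  # t.cellmd ->> :cellmd_1, i.e. the 'errortype' field as text
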
pixeltable/exprs/column_ref.py
CHANGED

@@ -115,11 +115,15 @@ class ColumnRef(Expr):
         from .column_property_ref import ColumnPropertyRef
 
         # resolve column properties
+        if name == ColumnPropertyRef.Property.CELLMD.name.lower():
+            # This is not user accessible, but used internally to store cell metadata
+            return super().__getattr__(name)
+
         if (
             name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
             or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
         ):
-            property_is_present = self.col.
+            property_is_present = self.col.stores_cellmd
             if not property_is_present:
                 raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
             return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
@@ -321,7 +325,8 @@
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
         tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
-
+        # validate_initialized=False: this gets called as part of TableVersion.init()
+        tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version, validate_initialized=False)
         # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
         col = next(col for col in tbl_version.cols if col.id == col_id)
         return col

pixeltable/exprs/function_call.py
CHANGED

@@ -446,11 +446,11 @@ class FunctionCall(Expr):
             dedent(
                 f"""
                 The UDF '{fn.self_path}' cannot be located, because
-                {{
+                {{error_msg}}
                 """
             )
             .strip()
-            .format(
+            .format(error_msg=fn.error_msg)
         )
         return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
 

pixeltable/exprs/row_builder.py
CHANGED

@@ -209,7 +209,7 @@ class RowBuilder:
                 # this is input and therefore doesn't depend on other exprs
                 continue
             # error properties don't have exceptions themselves
-            if isinstance(expr, ColumnPropertyRef) and expr.
+            if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
                 continue
             dependency_idxs = [d.slot_idx for d in expr.dependencies()]
             self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -444,6 +444,8 @@
         Return tuple[list of row values in `self.table_columns` order, # of exceptions]
         This excludes system columns.
         """
+        from pixeltable.exprs.column_property_ref import ColumnPropertyRef
+
         num_excs = 0
         table_row: list[Any] = list(pk)
         for info in self.table_columns:
@@ -454,9 +456,9 @@
                 if cols_with_excs is not None:
                     cols_with_excs.add(col.id)
                 table_row.append(None)
-                if col.
-                    # exceptions get stored in the errortype/-msg
-                    table_row.
+                if col.stores_cellmd:
+                    # exceptions get stored in the errortype/-msg properties of the cellmd column
+                    table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
             else:
                 if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
                     # we have yet to store this image
@@ -464,8 +466,8 @@
                     data_row.flush_img(slot_idx, filepath)
                 val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row.append(val)
-                if col.
-                    table_row.
+                if col.stores_cellmd:
+                    table_row.append(None)  # placeholder for cellmd column
 
         return table_row, num_excs
 
@@ -483,8 +485,7 @@
             if col.col.col_type.is_media_type():
                 media_cols[len(store_col_names)] = col.col
             store_col_names.append(col.col.store_name())
-            if col.col.
-                store_col_names.append(col.col.
-                store_col_names.append(col.col.errormsg_store_name())
+            if col.col.stores_cellmd:
+                store_col_names.append(col.col.cellmd_store_name())
 
         return store_col_names, media_cols

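Taken together with create_cellmd_exc above, the cellmd value stored for a failed cell is a plain two-key dict; for example:

    try:
        int('not a number')
    except Exception as exc:
        cellmd = {'errortype': type(exc).__name__, 'errormsg': str(exc)}
        print(cellmd)
    # {'errortype': 'ValueError', 'errormsg': "invalid literal for int() with base 10: 'not a number'"}
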
pixeltable/exprs/rowid_ref.py
CHANGED

@@ -105,10 +105,6 @@ class RowidRef(Expr):
         assert self.rowid_component_idx <= len(rowid_cols), (
             f'{self.rowid_component_idx} not consistent with {rowid_cols}'
         )
-        # _logger.debug(
-        #     f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
-        #     f'tv={id(tbl):x}'
-        # )
         return rowid_cols[self.rowid_component_idx]
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:

pixeltable/func/function.py
CHANGED

@@ -504,12 +504,12 @@ class Function(ABC):
 
 class InvalidFunction(Function):
     fn_dict: dict[str, Any]
-
+    error_msg: str
 
-    def __init__(self, self_path: str, fn_dict: dict[str, Any],
+    def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
         super().__init__([], self_path)
         self.fn_dict = fn_dict
-        self.
+        self.error_msg = error_msg
 
     def _as_dict(self) -> dict:
         """

pixeltable/functions/audio.py
CHANGED

@@ -1,14 +1,5 @@
 """
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
-
-Example:
-```python
-import pixeltable as pxt
-import pixeltable.functions as pxtf
-
-t = pxt.get_table(...)
-t.select(pxtf.audio.get_metadata()).collect()
-```
 """
 
 import pixeltable as pxt
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
 def get_metadata(audio: pxt.Audio) -> dict:
     """
     Gets various metadata associated with an audio file and returns it as a dictionary.
+
+    Args:
+        audio: The audio to get metadata for.
+
+    Returns:
+        A `dict` such as the following:
+
+        ```json
+        {
+            'size': 2568827,
+            'streams': [
+                {
+                    'type': 'audio',
+                    'frames': 0,
+                    'duration': 2646000,
+                    'metadata': {},
+                    'time_base': 2.2675736961451248e-05,
+                    'codec_context': {
+                        'name': 'flac',
+                        'profile': None,
+                        'channels': 1,
+                        'codec_tag': '\\x00\\x00\\x00\\x00',
+                    },
+                    'duration_seconds': 60.0,
+                }
+            ],
+            'bit_rate': 342510,
+            'metadata': {'encoder': 'Lavf61.1.100'},
+            'bit_exact': False,
+        }
+        ```
+
+    Examples:
+        Extract metadata for files in the `audio_col` column of the table `tbl`:
+
+        >>> tbl.select(tbl.audio_col.get_metadata()).collect()
     """
     return pxt.functions.video._get_metadata(audio)
 

pixeltable/functions/gemini.py
CHANGED

@@ -7,7 +7,6 @@ the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini)
 
 import asyncio
 import io
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -215,9 +214,10 @@ async def generate_videos(
     video_bytes = await _genai_client().aio.files.download(file=video.video)  # type: ignore[arg-type]
     assert video_bytes is not None
 
-
-
-
+    # Create a temporary file to store the video bytes
+    output_path = env.Env.get().create_tmp_path('.mp4')
+    Path(output_path).write_bytes(video_bytes)
+    return str(output_path)
 
 
 @generate_videos.resource_pool

pixeltable/functions/openai.py
CHANGED

@@ -13,7 +13,6 @@ import logging
 import math
 import pathlib
 import re
-import uuid
 from typing import TYPE_CHECKING, Any, Callable, Optional, Type
 
 import httpx
@@ -207,7 +206,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
 
     content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
     ext = model_kwargs.get('response_format', 'mp3')
-    output_filename = str(env.Env.get().
+    output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
     content.write_to_file(output_filename)
     return output_filename
 

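This change and the gemini.py one above both replace ad-hoc tempfile/uuid usage with env.Env.get().create_tmp_path(suffix). Assuming it returns a unique, not-yet-created path with the given extension inside Pixeltable's managed temp directory, a rough standalone equivalent would be:

    import tempfile
    import uuid
    from pathlib import Path

    def create_tmp_path(suffix: str) -> Path:
        # unique path in the temp dir; creating the file is left to the caller
        return Path(tempfile.gettempdir()) / f'{uuid.uuid4().hex}{suffix}'

    out = create_tmp_path('.mp3')
    out.write_bytes(b'ID3')  # placeholder payload
    print(out)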