pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (60) hide show
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -11
  4. pixeltable/catalog/catalog.py +575 -220
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +2 -148
  8. pixeltable/catalog/insertable_table.py +15 -13
  9. pixeltable/catalog/path.py +6 -0
  10. pixeltable/catalog/schema_object.py +9 -4
  11. pixeltable/catalog/table.py +96 -85
  12. pixeltable/catalog/table_version.py +257 -174
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/tbl_ops.py +44 -0
  15. pixeltable/catalog/update_status.py +179 -0
  16. pixeltable/catalog/view.py +50 -56
  17. pixeltable/config.py +76 -12
  18. pixeltable/dataframe.py +19 -6
  19. pixeltable/env.py +50 -4
  20. pixeltable/exec/data_row_batch.py +3 -1
  21. pixeltable/exec/exec_node.py +7 -24
  22. pixeltable/exec/expr_eval/schedulers.py +134 -7
  23. pixeltable/exec/in_memory_data_node.py +6 -7
  24. pixeltable/exprs/column_property_ref.py +21 -9
  25. pixeltable/exprs/column_ref.py +7 -2
  26. pixeltable/exprs/function_call.py +2 -2
  27. pixeltable/exprs/row_builder.py +10 -9
  28. pixeltable/exprs/rowid_ref.py +0 -4
  29. pixeltable/func/function.py +3 -3
  30. pixeltable/functions/audio.py +36 -9
  31. pixeltable/functions/gemini.py +4 -4
  32. pixeltable/functions/openai.py +1 -2
  33. pixeltable/functions/video.py +59 -16
  34. pixeltable/globals.py +109 -24
  35. pixeltable/io/__init__.py +1 -1
  36. pixeltable/io/datarows.py +2 -1
  37. pixeltable/io/external_store.py +3 -55
  38. pixeltable/io/globals.py +4 -4
  39. pixeltable/io/hf_datasets.py +10 -2
  40. pixeltable/io/label_studio.py +16 -16
  41. pixeltable/io/pandas.py +1 -0
  42. pixeltable/io/table_data_conduit.py +12 -13
  43. pixeltable/iterators/audio.py +17 -8
  44. pixeltable/iterators/image.py +5 -2
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_39.py +125 -0
  47. pixeltable/metadata/converters/util.py +3 -0
  48. pixeltable/metadata/notes.py +1 -0
  49. pixeltable/metadata/schema.py +50 -1
  50. pixeltable/plan.py +4 -0
  51. pixeltable/share/packager.py +20 -38
  52. pixeltable/store.py +40 -51
  53. pixeltable/type_system.py +2 -2
  54. pixeltable/utils/coroutine.py +6 -23
  55. pixeltable/utils/media_store.py +50 -0
  56. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
  57. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
  58. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import datetime
4
5
  import glob
5
6
  import http.server
@@ -19,9 +20,10 @@ from contextlib import contextmanager
19
20
  from dataclasses import dataclass, field
20
21
  from pathlib import Path
21
22
  from sys import stdout
22
- from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
23
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
23
24
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
24
25
 
26
+ import nest_asyncio # type: ignore[import-untyped]
25
27
  import pixeltable_pgserver
26
28
  import sqlalchemy as sql
27
29
  from pillow_heif import register_heif_opener # type: ignore[import-untyped]
@@ -84,7 +86,9 @@ class Env:
84
86
  _resource_pool_info: dict[str, Any]
85
87
  _current_conn: Optional[sql.Connection]
86
88
  _current_session: Optional[sql.orm.Session]
89
+ _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
87
90
  _dbms: Optional[Dbms]
91
+ _event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
88
92
 
89
93
  @classmethod
90
94
  def get(cls) -> Env:
@@ -96,6 +100,7 @@ class Env:
96
100
  def _init_env(cls, reinit_db: bool = False) -> None:
97
101
  assert not cls.__initializing, 'Circular env initialization detected.'
98
102
  cls.__initializing = True
103
+ cls._instance = None
99
104
  env = Env()
100
105
  env._set_up(reinit_db=reinit_db)
101
106
  env._upgrade_metadata()
@@ -139,7 +144,34 @@ class Env:
139
144
  self._resource_pool_info = {}
140
145
  self._current_conn = None
141
146
  self._current_session = None
147
+ self._current_isolation_level = None
142
148
  self._dbms = None
149
+ self._event_loop = None
150
+
151
+ def _init_event_loop(self) -> None:
152
+ try:
153
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
154
+ # multiple run_until_complete()
155
+ running_loop = asyncio.get_running_loop()
156
+ self._event_loop = running_loop
157
+ _logger.debug('Patched running loop')
158
+ except RuntimeError:
159
+ self._event_loop = asyncio.new_event_loop()
160
+ asyncio.set_event_loop(self._event_loop)
161
+ # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
162
+ self._event_loop.slow_callback_duration = 3600
163
+
164
+ # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
165
+ # see run_coroutine_synchronously()
166
+ nest_asyncio.apply()
167
+ if _logger.isEnabledFor(logging.DEBUG):
168
+ self._event_loop.set_debug(True)
169
+
170
+ @property
171
+ def event_loop(self) -> asyncio.AbstractEventLoop:
172
+ if self._event_loop is None:
173
+ self._init_event_loop()
174
+ return self._event_loop
143
175
 
144
176
  @property
145
177
  def db_url(self) -> str:
@@ -201,20 +233,34 @@ class Env:
201
233
  return self._db_server is not None
202
234
 
203
235
  @contextmanager
204
- def begin_xact(self) -> Iterator[sql.Connection]:
205
- """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
236
+ def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
237
+ """
238
+ Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
239
+
240
+ for_write: if True, uses serializable isolation; if False, uses repeatable_read
241
+
242
+ TODO: repeatable read is not available in CockroachDB; instead, run queries against a snapshot TVP
243
+ that avoids tripping over any pending ops
244
+ """
206
245
  if self._current_conn is None:
207
246
  assert self._current_session is None
208
247
  try:
209
- with self.engine.begin() as conn, sql.orm.Session(conn) as session:
248
+ self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
249
+ with (
250
+ self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
251
+ sql.orm.Session(conn) as session,
252
+ conn.begin(),
253
+ ):
210
254
  self._current_conn = conn
211
255
  self._current_session = session
212
256
  yield conn
213
257
  finally:
214
258
  self._current_session = None
215
259
  self._current_conn = None
260
+ self._current_isolation_level = None
216
261
  else:
217
262
  assert self._current_session is not None
263
+ assert for_write == (self._current_isolation_level == 'SERIALIZABLE')
218
264
  yield self._current_conn
219
265
 
220
266
  def configure_logging(
@@ -90,7 +90,9 @@ class DataRowBatch:
90
90
  idx_range = slice(0, len(self.rows))
91
91
  for row in self.rows[idx_range]:
92
92
  for info in stored_img_info:
93
- filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
93
+ col = info.col
94
+ assert col.tbl.id == self.tbl.id
95
+ filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
94
96
  row.flush_img(info.slot_idx, filepath)
95
97
  for slot_idx in flushed_slot_idxs:
96
98
  row.flush_img(slot_idx)
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- import asyncio
5
4
  import logging
6
5
  from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
7
6
 
8
7
  from pixeltable import exprs
8
+ from pixeltable.env import Env
9
9
 
10
10
  from .data_row_batch import DataRowBatch
11
11
  from .exec_context import ExecContext
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
59
59
  pass
60
60
 
61
61
  def __iter__(self) -> Iterator[DataRowBatch]:
62
- running_loop: Optional[asyncio.AbstractEventLoop] = None
63
- loop: asyncio.AbstractEventLoop
64
- try:
65
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
66
- # multiple run_until_complete()
67
- running_loop = asyncio.get_running_loop()
68
- import nest_asyncio # type: ignore[import-untyped]
69
-
70
- nest_asyncio.apply()
71
- loop = running_loop
72
- _logger.debug('Patched running loop')
73
- except RuntimeError:
74
- loop = asyncio.new_event_loop()
75
- asyncio.set_event_loop(loop)
76
- # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
77
- loop.slow_callback_duration = 3600
78
-
79
- if _logger.isEnabledFor(logging.DEBUG):
80
- loop.set_debug(True)
81
-
62
+ loop = Env.get().event_loop
82
63
  aiter = self.__aiter__()
83
64
  try:
84
65
  while True:
@@ -86,9 +67,11 @@ class ExecNode(abc.ABC):
86
67
  yield batch
87
68
  except StopAsyncIteration:
88
69
  pass
89
- finally:
90
- if loop != running_loop:
91
- loop.close()
70
+ # TODO:
71
+ # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
72
+ # we end up here
73
+ # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
74
+ # creates tasks on its own
92
75
 
93
76
  def open(self) -> None:
94
77
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -4,9 +4,10 @@ import asyncio
4
4
  import datetime
5
5
  import inspect
6
6
  import logging
7
+ import re
7
8
  import sys
8
9
  import time
9
- from typing import Awaitable, Collection, Optional
10
+ from typing import Any, Awaitable, Collection, Optional
10
11
 
11
12
  from pixeltable import env, func
12
13
  from pixeltable.config import Config
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
250
251
  total_retried: int
251
252
 
252
253
  TIME_FORMAT = '%H:%M.%S %f'
253
- MAX_RETRIES = 10
254
+ MAX_RETRIES = 3
254
255
  DEFAULT_RATE_LIMIT = 600 # requests per minute
256
+ RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
257
+ RETRY_AFTER_PATTERNS = (
258
+ r'retry after (\d+(?:\.\d+)?)\s*seconds?',
259
+ r'try again in (\d+(?:\.\d+)?)\s*seconds?',
260
+ r'wait (\d+(?:\.\d+)?)\s*seconds?',
261
+ r'retry-after:\s*(\d+(?:\.\d+)?)',
262
+ )
263
+
264
+ # Exponential backoff defaults
265
+ BASE_RETRY_DELAY = 1.0 # in seconds
266
+ MAX_RETRY_DELAY = 60.0 # in seconds
267
+ RETRY_BACKOFF_MULTIPLIER = 2.0
255
268
 
256
269
  def __init__(self, resource_pool: str, dispatcher: Dispatcher):
257
270
  super().__init__(resource_pool, dispatcher)
@@ -337,11 +350,12 @@ class RequestRateScheduler(Scheduler):
337
350
  self.dispatcher.dispatch(request.rows, exec_ctx)
338
351
 
339
352
  except Exception as exc:
340
- # TODO: which exception can be retried?
341
- _logger.debug(f'exception for {self.resource_pool}: {exc}')
342
- status = getattr(exc, 'status', None)
343
- _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
344
- if num_retries < self.MAX_RETRIES:
353
+ _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
354
+ is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
355
+ if is_rate_limit_error and num_retries < self.MAX_RETRIES:
356
+ retry_delay = self._compute_retry_delay(num_retries, retry_after)
357
+ _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
358
+ await asyncio.sleep(retry_delay)
345
359
  self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
346
360
  return
347
361
 
@@ -358,6 +372,119 @@ class RequestRateScheduler(Scheduler):
358
372
  if is_task:
359
373
  self.num_in_flight -= 1
360
374
 
375
+ def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
376
+ """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
377
+ from http import HTTPStatus
378
+
379
+ # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
380
+ # We look for attributes that contain status codes, instead of checking the type of the exception,
381
+ # in order to handle a wider variety of exception classes.
382
+ is_rate_limit_error = False
383
+ retry_delay: Optional[float] = None
384
+
385
+ # requests.HTTPError/httpx.HTTPStatusError
386
+ if (
387
+ hasattr(exc, 'response')
388
+ and hasattr(exc.response, 'status_code')
389
+ and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
390
+ ):
391
+ is_rate_limit_error = True
392
+ retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
393
+ elif (
394
+ # urllib.error.HTTPError
395
+ (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
396
+ # aiohttp.ClientResponseError
397
+ or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
398
+ ) and hasattr(exc, 'headers'):
399
+ is_rate_limit_error = True
400
+ retry_delay = self._extract_retry_delay_from_headers(exc.headers)
401
+
402
+ if is_rate_limit_error:
403
+ return True, retry_delay
404
+
405
+ # Check common rate limit keywords in exception message
406
+ error_msg = str(exc).lower()
407
+ if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
408
+ retry_delay = self._extract_retry_delay_from_message(error_msg)
409
+ return True, retry_delay
410
+
411
+ return False, None
412
+
413
+ def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
414
+ """Extract retry delay from HTTP headers."""
415
+ if headers is None:
416
+ return None
417
+
418
+ # convert headers to dict-like object for consistent access
419
+ header_dict: dict
420
+ if hasattr(headers, 'get'):
421
+ header_dict = headers
422
+ else:
423
+ # headers are a list of tuples or other format
424
+ try:
425
+ header_dict = dict(headers)
426
+ except (TypeError, ValueError):
427
+ return None
428
+ # normalize dict keys: lowercase and remove dashes
429
+ header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
430
+
431
+ # check Retry-After header
432
+ retry_after = header_dict.get('retryafter')
433
+ if retry_after is not None:
434
+ try:
435
+ return float(retry_after)
436
+ except (ValueError, TypeError):
437
+ pass
438
+
439
+ # check X-RateLimit-Reset (Unix timestamp)
440
+ reset_time = header_dict.get('xratelimitreset')
441
+ if reset_time is not None:
442
+ try:
443
+ reset_timestamp = float(reset_time)
444
+ delay = max(0, reset_timestamp - time.time())
445
+ return delay
446
+ except (ValueError, TypeError):
447
+ pass
448
+
449
+ # check X-RateLimit-Reset-After (seconds from now)
450
+ reset_after = header_dict.get('xratelimitresetafter')
451
+ if reset_after is not None:
452
+ try:
453
+ return float(reset_after)
454
+ except (ValueError, TypeError):
455
+ pass
456
+
457
+ return None
458
+
459
+ def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
460
+ msg_lower = msg.lower()
461
+ for pattern in self.RETRY_AFTER_PATTERNS:
462
+ match = re.search(pattern, msg_lower)
463
+ if match is not None:
464
+ try:
465
+ return float(match.group(1))
466
+ except (ValueError, TypeError):
467
+ continue
468
+ return None
469
+
470
+ def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
471
+ """
472
+ Calculate exponential backoff delay for rate limit errors.
473
+
474
+ Args:
475
+ num_retries: Number of retries attempted (0-based)
476
+ retry_after: Suggested delay from Retry-After header
477
+
478
+ Returns:
479
+ Delay in seconds
480
+ """
481
+ if retry_after is not None and retry_after > 0:
482
+ # Use server-suggested delay, but cap it at max_delay
483
+ return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
484
+ else:
485
+ delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
486
+ return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
487
+
361
488
 
362
489
  # all concrete Scheduler subclasses that implement matches()
363
490
  SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
@@ -63,13 +63,12 @@ class InMemoryDataNode(ExecNode):
63
63
  for col_name, val in input_row.items():
64
64
  col_info = user_cols_by_name.get(col_name)
65
65
  assert col_info is not None
66
-
67
- if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
68
- # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
69
- path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.get().version))
70
- with open(path, 'wb') as fp:
71
- fp.write(val)
72
- self.output_rows[row_idx][col_info.slot_idx] = path
66
+ col = col_info.col
67
+ if col.col_type.is_image_type() and isinstance(val, bytes):
68
+ # this is a literal image, ie, a sequence of bytes; save it as a binary file and store the path
69
+ assert col.tbl.id == self.tbl.id
70
+ path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
71
+ self.output_rows[row_idx][col_info.slot_idx] = str(path)
73
72
  else:
74
73
  self.output_rows[row_idx][col_info.slot_idx] = val
75
74
 
@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
26
26
  ERRORMSG = 1
27
27
  FILEURL = 2
28
28
  LOCALPATH = 3
29
+ CELLMD = 4 # JSON metadata for the cell, e.g. errortype, errormsg for media columns
29
30
 
30
31
  def __init__(self, col_ref: ColumnRef, prop: Property):
31
32
  super().__init__(ts.StringType(nullable=True))
@@ -51,8 +52,8 @@ class ColumnPropertyRef(Expr):
51
52
  def __repr__(self) -> str:
52
53
  return f'{self._col_ref}.{self.prop.name.lower()}'
53
54
 
54
- def is_error_prop(self) -> bool:
55
- return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
55
+ def is_cellmd_prop(self) -> bool:
56
+ return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
56
57
 
57
58
  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
58
59
  if not self._col_ref.col_handle.get().is_stored:
@@ -63,21 +64,27 @@ class ColumnPropertyRef(Expr):
63
64
  if (
64
65
  col.col_type.is_media_type()
65
66
  and col.media_validation == catalog.MediaValidation.ON_READ
66
- and self.is_error_prop()
67
+ and self.is_cellmd_prop()
67
68
  ):
68
69
  return None
69
70
 
70
71
  if self.prop == self.Property.ERRORTYPE:
71
- assert col.sa_errortype_col is not None
72
- return col.sa_errortype_col
72
+ return col.sa_cellmd_col.op('->>')('errortype')
73
73
  if self.prop == self.Property.ERRORMSG:
74
- assert col.sa_errormsg_col is not None
75
- return col.sa_errormsg_col
74
+ return col.sa_cellmd_col.op('->>')('errormsg')
75
+ if self.prop == self.Property.CELLMD:
76
+ assert col.sa_cellmd_col is not None
77
+ return col.sa_cellmd_col
76
78
  if self.prop == self.Property.FILEURL:
77
79
  # the file url is stored as the column value
78
80
  return sql_elements.get(self._col_ref)
79
81
  return None
80
82
 
83
+ @classmethod
84
+ def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
85
+ """Create a cellmd value from an exception."""
86
+ return {'errortype': type(exc).__name__, 'errormsg': str(exc)}
87
+
81
88
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
82
89
  if self.prop == self.Property.FILEURL:
83
90
  assert data_row.has_val[self._col_ref.slot_idx]
@@ -87,14 +94,19 @@ class ColumnPropertyRef(Expr):
87
94
  assert data_row.has_val[self._col_ref.slot_idx]
88
95
  data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
89
96
  return
90
- elif self.is_error_prop():
97
+ elif self.is_cellmd_prop():
91
98
  exc = data_row.get_exc(self._col_ref.slot_idx)
92
99
  if exc is None:
93
100
  data_row[self.slot_idx] = None
94
101
  elif self.prop == self.Property.ERRORTYPE:
95
102
  data_row[self.slot_idx] = type(exc).__name__
96
- else:
103
+ elif self.prop == self.Property.ERRORMSG:
97
104
  data_row[self.slot_idx] = str(exc)
105
+ elif self.prop == self.Property.CELLMD:
106
+ data_row[self.slot_idx] = self.create_cellmd_exc(exc)
107
+ else:
108
+ raise AssertionError(f'Unknown property {self.prop}')
109
+ return
98
110
  else:
99
111
  raise AssertionError()
100
112
 
@@ -115,11 +115,15 @@ class ColumnRef(Expr):
115
115
  from .column_property_ref import ColumnPropertyRef
116
116
 
117
117
  # resolve column properties
118
+ if name == ColumnPropertyRef.Property.CELLMD.name.lower():
119
+ # This is not user accessible, but used internally to store cell metadata
120
+ return super().__getattr__(name)
121
+
118
122
  if (
119
123
  name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
120
124
  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
121
125
  ):
122
- property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
126
+ property_is_present = self.col.stores_cellmd
123
127
  if not property_is_present:
124
128
  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
125
129
  return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
@@ -321,7 +325,8 @@ class ColumnRef(Expr):
321
325
  @classmethod
322
326
  def get_column(cls, d: dict) -> catalog.Column:
323
327
  tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
324
- tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version)
328
+ # validate_initialized=False: this gets called as part of TableVersion.init()
329
+ tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version, validate_initialized=False)
325
330
  # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
326
331
  col = next(col for col in tbl_version.cols if col.id == col_id)
327
332
  return col
@@ -446,11 +446,11 @@ class FunctionCall(Expr):
446
446
  dedent(
447
447
  f"""
448
448
  The UDF '{fn.self_path}' cannot be located, because
449
- {{errormsg}}
449
+ {{error_msg}}
450
450
  """
451
451
  )
452
452
  .strip()
453
- .format(errormsg=fn.errormsg)
453
+ .format(error_msg=fn.error_msg)
454
454
  )
455
455
  return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
456
456
 
@@ -209,7 +209,7 @@ class RowBuilder:
209
209
  # this is input and therefore doesn't depend on other exprs
210
210
  continue
211
211
  # error properties don't have exceptions themselves
212
- if isinstance(expr, ColumnPropertyRef) and expr.is_error_prop():
212
+ if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
213
213
  continue
214
214
  dependency_idxs = [d.slot_idx for d in expr.dependencies()]
215
215
  self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -444,6 +444,8 @@ class RowBuilder:
444
444
  Return tuple[list of row values in `self.table_columns` order, # of exceptions]
445
445
  This excludes system columns.
446
446
  """
447
+ from pixeltable.exprs.column_property_ref import ColumnPropertyRef
448
+
447
449
  num_excs = 0
448
450
  table_row: list[Any] = list(pk)
449
451
  for info in self.table_columns:
@@ -454,9 +456,9 @@ class RowBuilder:
454
456
  if cols_with_excs is not None:
455
457
  cols_with_excs.add(col.id)
456
458
  table_row.append(None)
457
- if col.records_errors:
458
- # exceptions get stored in the errortype/-msg columns
459
- table_row.extend((type(exc).__name__, str(exc)))
459
+ if col.stores_cellmd:
460
+ # exceptions get stored in the errortype/-msg properties of the cellmd column
461
+ table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
460
462
  else:
461
463
  if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
462
464
  # we have yet to store this image
@@ -464,8 +466,8 @@ class RowBuilder:
464
466
  data_row.flush_img(slot_idx, filepath)
465
467
  val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
466
468
  table_row.append(val)
467
- if col.records_errors:
468
- table_row.extend((None, None))
469
+ if col.stores_cellmd:
470
+ table_row.append(None) # placeholder for cellmd column
469
471
 
470
472
  return table_row, num_excs
471
473
 
@@ -483,8 +485,7 @@ class RowBuilder:
483
485
  if col.col.col_type.is_media_type():
484
486
  media_cols[len(store_col_names)] = col.col
485
487
  store_col_names.append(col.col.store_name())
486
- if col.col.records_errors:
487
- store_col_names.append(col.col.errortype_store_name())
488
- store_col_names.append(col.col.errormsg_store_name())
488
+ if col.col.stores_cellmd:
489
+ store_col_names.append(col.col.cellmd_store_name())
489
490
 
490
491
  return store_col_names, media_cols
@@ -105,10 +105,6 @@ class RowidRef(Expr):
105
105
  assert self.rowid_component_idx <= len(rowid_cols), (
106
106
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
107
107
  )
108
- # _logger.debug(
109
- # f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
110
- # f'tv={id(tbl):x}'
111
- # )
112
108
  return rowid_cols[self.rowid_component_idx]
113
109
 
114
110
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -504,12 +504,12 @@ class Function(ABC):
504
504
 
505
505
  class InvalidFunction(Function):
506
506
  fn_dict: dict[str, Any]
507
- errormsg: str
507
+ error_msg: str
508
508
 
509
- def __init__(self, self_path: str, fn_dict: dict[str, Any], errormsg: str):
509
+ def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
510
510
  super().__init__([], self_path)
511
511
  self.fn_dict = fn_dict
512
- self.errormsg = errormsg
512
+ self.error_msg = error_msg
513
513
 
514
514
  def _as_dict(self) -> dict:
515
515
  """
@@ -1,14 +1,5 @@
1
1
  """
2
2
  Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
3
-
4
- Example:
5
- ```python
6
- import pixeltable as pxt
7
- import pixeltable.functions as pxtf
8
-
9
- t = pxt.get_table(...)
10
- t.select(pxtf.audio.get_metadata()).collect()
11
- ```
12
3
  """
13
4
 
14
5
  import pixeltable as pxt
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
19
10
  def get_metadata(audio: pxt.Audio) -> dict:
20
11
  """
21
12
  Gets various metadata associated with an audio file and returns it as a dictionary.
13
+
14
+ Args:
15
+ audio: The audio to get metadata for.
16
+
17
+ Returns:
18
+ A `dict` such as the following:
19
+
20
+ ```json
21
+ {
22
+ 'size': 2568827,
23
+ 'streams': [
24
+ {
25
+ 'type': 'audio',
26
+ 'frames': 0,
27
+ 'duration': 2646000,
28
+ 'metadata': {},
29
+ 'time_base': 2.2675736961451248e-05,
30
+ 'codec_context': {
31
+ 'name': 'flac',
32
+ 'profile': None,
33
+ 'channels': 1,
34
+ 'codec_tag': '\\x00\\x00\\x00\\x00',
35
+ },
36
+ 'duration_seconds': 60.0,
37
+ }
38
+ ],
39
+ 'bit_rate': 342510,
40
+ 'metadata': {'encoder': 'Lavf61.1.100'},
41
+ 'bit_exact': False,
42
+ }
43
+ ```
44
+
45
+ Examples:
46
+ Extract metadata for files in the `audio_col` column of the table `tbl`:
47
+
48
+ >>> tbl.select(tbl.audio_col.get_metadata()).collect()
22
49
  """
23
50
  return pxt.functions.video._get_metadata(audio)
24
51
 
@@ -7,7 +7,6 @@ the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini)
7
7
 
8
8
  import asyncio
9
9
  import io
10
- import tempfile
11
10
  from pathlib import Path
12
11
  from typing import TYPE_CHECKING, Optional
13
12
 
@@ -215,9 +214,10 @@ async def generate_videos(
215
214
  video_bytes = await _genai_client().aio.files.download(file=video.video) # type: ignore[arg-type]
216
215
  assert video_bytes is not None
217
216
 
218
- _, output_filename = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
219
- Path(output_filename).write_bytes(video_bytes)
220
- return output_filename
217
+ # Create a temporary file to store the video bytes
218
+ output_path = env.Env.get().create_tmp_path('.mp4')
219
+ Path(output_path).write_bytes(video_bytes)
220
+ return str(output_path)
221
221
 
222
222
 
223
223
  @generate_videos.resource_pool
@@ -13,7 +13,6 @@ import logging
13
13
  import math
14
14
  import pathlib
15
15
  import re
16
- import uuid
17
16
  from typing import TYPE_CHECKING, Any, Callable, Optional, Type
18
17
 
19
18
  import httpx
@@ -207,7 +206,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
207
206
 
208
207
  content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
209
208
  ext = model_kwargs.get('response_format', 'mp3')
210
- output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
209
+ output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
211
210
  content.write_to_file(output_filename)
212
211
  return output_filename
213
212