pixeltable 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff shows the changes between package versions as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (152)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/catalog.py +119 -100
  4. pixeltable/catalog/column.py +104 -115
  5. pixeltable/catalog/globals.py +1 -2
  6. pixeltable/catalog/insertable_table.py +44 -49
  7. pixeltable/catalog/path.py +3 -4
  8. pixeltable/catalog/schema_object.py +4 -4
  9. pixeltable/catalog/table.py +118 -122
  10. pixeltable/catalog/table_metadata.py +6 -6
  11. pixeltable/catalog/table_version.py +322 -257
  12. pixeltable/catalog/table_version_handle.py +4 -4
  13. pixeltable/catalog/table_version_path.py +9 -10
  14. pixeltable/catalog/tbl_ops.py +9 -3
  15. pixeltable/catalog/view.py +34 -28
  16. pixeltable/config.py +14 -10
  17. pixeltable/dataframe.py +68 -77
  18. pixeltable/env.py +74 -64
  19. pixeltable/exec/aggregation_node.py +6 -6
  20. pixeltable/exec/cache_prefetch_node.py +10 -10
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +4 -5
  23. pixeltable/exec/exec_node.py +5 -5
  24. pixeltable/exec/expr_eval/evaluators.py +6 -6
  25. pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
  26. pixeltable/exec/expr_eval/globals.py +6 -6
  27. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  28. pixeltable/exec/expr_eval/schedulers.py +11 -11
  29. pixeltable/exec/in_memory_data_node.py +2 -2
  30. pixeltable/exec/object_store_save_node.py +14 -17
  31. pixeltable/exec/sql_node.py +25 -25
  32. pixeltable/exprs/arithmetic_expr.py +4 -4
  33. pixeltable/exprs/array_slice.py +2 -2
  34. pixeltable/exprs/column_property_ref.py +3 -3
  35. pixeltable/exprs/column_ref.py +61 -74
  36. pixeltable/exprs/comparison.py +5 -5
  37. pixeltable/exprs/compound_predicate.py +3 -3
  38. pixeltable/exprs/data_row.py +12 -12
  39. pixeltable/exprs/expr.py +41 -31
  40. pixeltable/exprs/expr_dict.py +3 -3
  41. pixeltable/exprs/expr_set.py +3 -3
  42. pixeltable/exprs/function_call.py +14 -14
  43. pixeltable/exprs/in_predicate.py +4 -4
  44. pixeltable/exprs/inline_expr.py +8 -8
  45. pixeltable/exprs/is_null.py +1 -3
  46. pixeltable/exprs/json_mapper.py +8 -8
  47. pixeltable/exprs/json_path.py +6 -6
  48. pixeltable/exprs/literal.py +5 -5
  49. pixeltable/exprs/method_ref.py +2 -2
  50. pixeltable/exprs/object_ref.py +2 -2
  51. pixeltable/exprs/row_builder.py +14 -14
  52. pixeltable/exprs/rowid_ref.py +8 -8
  53. pixeltable/exprs/similarity_expr.py +50 -25
  54. pixeltable/exprs/sql_element_cache.py +4 -4
  55. pixeltable/exprs/string_op.py +2 -2
  56. pixeltable/exprs/type_cast.py +3 -5
  57. pixeltable/func/aggregate_function.py +8 -8
  58. pixeltable/func/callable_function.py +9 -9
  59. pixeltable/func/expr_template_function.py +3 -3
  60. pixeltable/func/function.py +15 -17
  61. pixeltable/func/function_registry.py +6 -7
  62. pixeltable/func/globals.py +2 -3
  63. pixeltable/func/mcp.py +2 -2
  64. pixeltable/func/query_template_function.py +16 -16
  65. pixeltable/func/signature.py +14 -14
  66. pixeltable/func/tools.py +11 -11
  67. pixeltable/func/udf.py +16 -18
  68. pixeltable/functions/__init__.py +1 -0
  69. pixeltable/functions/anthropic.py +7 -7
  70. pixeltable/functions/audio.py +76 -0
  71. pixeltable/functions/bedrock.py +6 -6
  72. pixeltable/functions/deepseek.py +4 -4
  73. pixeltable/functions/fireworks.py +2 -2
  74. pixeltable/functions/gemini.py +6 -6
  75. pixeltable/functions/globals.py +12 -12
  76. pixeltable/functions/groq.py +4 -4
  77. pixeltable/functions/huggingface.py +18 -20
  78. pixeltable/functions/image.py +7 -10
  79. pixeltable/functions/llama_cpp.py +7 -7
  80. pixeltable/functions/math.py +2 -3
  81. pixeltable/functions/mistralai.py +3 -3
  82. pixeltable/functions/ollama.py +9 -9
  83. pixeltable/functions/openai.py +21 -21
  84. pixeltable/functions/openrouter.py +7 -7
  85. pixeltable/functions/string.py +21 -28
  86. pixeltable/functions/timestamp.py +7 -8
  87. pixeltable/functions/together.py +4 -6
  88. pixeltable/functions/twelvelabs.py +92 -0
  89. pixeltable/functions/video.py +2 -24
  90. pixeltable/functions/vision.py +6 -6
  91. pixeltable/functions/whisper.py +7 -7
  92. pixeltable/functions/whisperx.py +16 -16
  93. pixeltable/globals.py +52 -36
  94. pixeltable/index/base.py +12 -8
  95. pixeltable/index/btree.py +19 -22
  96. pixeltable/index/embedding_index.py +30 -39
  97. pixeltable/io/datarows.py +3 -3
  98. pixeltable/io/external_store.py +13 -16
  99. pixeltable/io/fiftyone.py +5 -5
  100. pixeltable/io/globals.py +5 -5
  101. pixeltable/io/hf_datasets.py +4 -4
  102. pixeltable/io/label_studio.py +12 -12
  103. pixeltable/io/pandas.py +6 -6
  104. pixeltable/io/parquet.py +2 -2
  105. pixeltable/io/table_data_conduit.py +12 -12
  106. pixeltable/io/utils.py +2 -2
  107. pixeltable/iterators/audio.py +2 -2
  108. pixeltable/iterators/video.py +8 -13
  109. pixeltable/metadata/converters/convert_18.py +2 -2
  110. pixeltable/metadata/converters/convert_19.py +2 -2
  111. pixeltable/metadata/converters/convert_20.py +2 -2
  112. pixeltable/metadata/converters/convert_21.py +2 -2
  113. pixeltable/metadata/converters/convert_22.py +2 -2
  114. pixeltable/metadata/converters/convert_24.py +2 -2
  115. pixeltable/metadata/converters/convert_25.py +2 -2
  116. pixeltable/metadata/converters/convert_26.py +2 -2
  117. pixeltable/metadata/converters/convert_29.py +4 -4
  118. pixeltable/metadata/converters/convert_34.py +2 -2
  119. pixeltable/metadata/converters/convert_36.py +2 -2
  120. pixeltable/metadata/converters/convert_38.py +2 -2
  121. pixeltable/metadata/converters/convert_39.py +1 -2
  122. pixeltable/metadata/converters/util.py +11 -13
  123. pixeltable/metadata/schema.py +22 -21
  124. pixeltable/metadata/utils.py +2 -6
  125. pixeltable/mypy/mypy_plugin.py +5 -5
  126. pixeltable/plan.py +30 -28
  127. pixeltable/share/packager.py +7 -7
  128. pixeltable/share/publish.py +3 -3
  129. pixeltable/store.py +125 -61
  130. pixeltable/type_system.py +43 -46
  131. pixeltable/utils/__init__.py +1 -2
  132. pixeltable/utils/arrow.py +4 -4
  133. pixeltable/utils/av.py +8 -0
  134. pixeltable/utils/azure_store.py +305 -0
  135. pixeltable/utils/code.py +1 -2
  136. pixeltable/utils/dbms.py +15 -19
  137. pixeltable/utils/description_helper.py +2 -3
  138. pixeltable/utils/documents.py +5 -6
  139. pixeltable/utils/exception_handler.py +2 -2
  140. pixeltable/utils/filecache.py +5 -5
  141. pixeltable/utils/formatter.py +4 -6
  142. pixeltable/utils/gcs_store.py +9 -9
  143. pixeltable/utils/local_store.py +17 -17
  144. pixeltable/utils/object_stores.py +59 -43
  145. pixeltable/utils/s3_store.py +35 -30
  146. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/METADATA +1 -1
  147. pixeltable-0.4.19.dist-info/RECORD +213 -0
  148. pixeltable/__version__.py +0 -3
  149. pixeltable-0.4.18.dist-info/RECORD +0 -211
  150. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
  151. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
  152. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/store.py CHANGED
@@ -3,8 +3,9 @@ from __future__ import annotations
 import abc
 import logging
 import sys
+import time
 import warnings
-from typing import Any, Iterable, Iterator, Optional
+from typing import Any, Iterable, Iterator
 
 import more_itertools
 import psycopg
@@ -33,11 +34,11 @@ class StoreBase:
 
     tbl_version: catalog.TableVersionHandle
     sa_md: sql.MetaData
-    sa_tbl: Optional[sql.Table]
+    sa_tbl: sql.Table | None
     _pk_cols: list[sql.Column]
     v_min_col: sql.Column
     v_max_col: sql.Column
-    base: Optional[StoreBase]
+    base: StoreBase | None
 
     # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
     # benefit to going higher.
@@ -79,12 +80,13 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
         if tbl_version is None:
             tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
+        # we captured all columns, including dropped ones: they're still part of the physical table
         for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
@@ -111,7 +113,10 @@ class StoreBase:
             idx_name = f'vmax_idx_{tbl_version.id.hex}'
             idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
-        # TODO: Include indices to ensure a completely accurate SA table definition?
+        # we only capture indices visible in this version
+        for idx_info in tbl_version.idxs.values():
+            idx = idx_info.idx.sa_index(tbl_version._store_idx_name(idx_info.id), idx_info.val_col)
+            idxs.append(idx)
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
         # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
@@ -137,35 +142,122 @@ class StoreBase:
         assert isinstance(result, int)
         return result
 
+    def _exec_if_not_exists(self, stmt: str, wait_for_table: bool) -> None:
+        """
+        Execute a statement containing 'IF NOT EXISTS' and ignore any duplicate object-related errors.
+
+        The statement needs to run in a separate transaction, because the expected error conditions will abort the
+        enclosing transaction (and the ability to run additional statements in that same transaction).
+        """
+        while True:
+            with Env.get().begin_xact(for_write=True) as conn:
+                try:
+                    if wait_for_table:
+                        # Try to lock the table to make sure that it exists. This needs to run in the same transaction
+                        # as 'stmt' to avoid a race condition.
+                        # TODO: adapt this for CockroachDB
+                        lock_stmt = f'LOCK TABLE {self._storage_name()} IN ACCESS EXCLUSIVE MODE'
+                        conn.execute(sql.text(lock_stmt))
+                    conn.execute(sql.text(stmt))
+                    return
+                except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
+                    Env.get().console_logger.info(f'{stmt} failed with: {e}')
+                    if (
+                        isinstance(e.orig, psycopg.errors.UniqueViolation)
+                        and 'duplicate key value violates unique constraint' in str(e.orig)
+                    ) or (
+                        isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
+                        and 'already exists' in str(e.orig)
+                    ):
+                        # table already exists
+                        return
+                    elif isinstance(e.orig, psycopg.errors.UndefinedTable):
+                        # the Lock Table failed because the table doesn't exist yet; try again
+                        time.sleep(1)
+                        continue
+                    else:
+                        raise
+
+    def _store_tbl_exists(self) -> bool:
+        """Returns True if the store table exists, False otherwise."""
+        with Env.get().begin_xact(for_write=False) as conn:
+            q = (
+                'SELECT COUNT(*) FROM pg_catalog.pg_tables '
+                f"WHERE schemaname = 'public' AND tablename = {self._storage_name()!r}"
+            )
+            res = conn.execute(sql.text(q)).scalar_one()
+            return res == 1
+
     def create(self) -> None:
-        """Create If Not Exists for this table"""
+        """
+        Create or update store table to bring it in sync with self.sa_tbl. Idempotent.
+
+        This runs a sequence of DDL statements (Create Table, Alter Table Add Column, Create Index), each of which
+        is run in its own transaction.
+
+        The exception to that are local replicas, for which TableRestorer creates an enclosing transaction. In theory,
+        this should avoid the potential for race conditions that motivate the error handling present in
+        _exec_if_not_exists() (meaning: we shouldn't see those errors when creating local replicas).
+        TODO: remove the special case for local replicas in order to make the logic easier to reason about.
+        """
+        postgres_dialect = sql.dialects.postgresql.dialect()
+
+        if not self._store_tbl_exists():
+            # run Create Table If Not Exists; we always need If Not Exists to avoid race conditions between concurrent
+            # Pixeltable processes
+            create_stmt = sql.schema.CreateTable(self.sa_tbl, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_stmt), wait_for_table=False)
+        else:
+            # ensure that all columns exist by running Alter Table Add Column If Not Exists for all columns
+            for col in self.sa_tbl.columns:
+                stmt = self._add_column_stmt(col)
+                self._exec_if_not_exists(stmt, wait_for_table=True)
+            # TODO: do we also need to ensure that these columns are now visible (ie, is there another potential race
+            # condition here?)
+
+        # ensure that all visible indices exist by running Create Index If Not Exists
+        for index in self.sa_tbl.indexes:
+            create_stmt = sql.schema.CreateIndex(index, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_stmt), wait_for_table=True)
+
+    def create_index(self, idx_id: int) -> None:
+        """Create If Not Exists for this index"""
+        idx_info = self.tbl_version.get().idxs[idx_id]
+        sa_idx = idx_info.idx.sa_index(self.tbl_version.get()._store_idx_name(idx_id), idx_info.val_col)
         conn = Env.get().conn
-        stmt = sql.schema.CreateTable(self.sa_tbl).compile(conn)
+        stmt = sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(conn)
         create_stmt = str(stmt)
-        if_not_exists_stmt = create_stmt.replace('CREATE TABLE', 'CREATE TABLE IF NOT EXISTS')
-
-        # Postgres seems not to handle concurrent Create Table If Not Exists correctly, we need to ignore the various
-        # errors that can occur when two connections run the same Create Table statement.
-        try:
-            conn.execute(sql.text(if_not_exists_stmt))
-        except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
-            Env.get().console_logger.info(f'StoreBase.create() failed with: {e}')
-            if (
-                isinstance(e.orig, psycopg.errors.UniqueViolation)
-                and 'duplicate key value violates unique constraint "pg_type_typname_nsp_index"' in str(e.orig)
-            ) or (
-                isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
-                and 'already exists' in str(e.orig)
-            ):
-                pass
-            else:
-                raise
+        self._exec_if_not_exists(create_stmt, wait_for_table=True)
+
+    def validate(self) -> None:
+        """Validate store table against self.table_version"""
+        with Env.get().begin_xact() as conn:
+            # check that all columns are present
+            q = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+            store_col_info = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_col_info = {col.store_name() for col in self.tbl_version.get().cols if col.is_stored}
+            assert tbl_col_info.issubset(store_col_info)
+
+            # check that all visible indices are present
+            q = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
+            store_idx_names = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_index_names = {
+                self.tbl_version.get()._store_idx_name(info.id) for info in self.tbl_version.get().idxs.values()
            }
+            assert tbl_index_names.issubset(store_idx_names)
 
     def drop(self) -> None:
         """Drop store table"""
         conn = Env.get().conn
         self.sa_md.drop_all(bind=conn)
 
+    def _add_column_stmt(self, sa_col: sql.Column) -> str:
+        col_type_str = sa_col.type.compile(dialect=sql.dialects.postgresql.dialect())
+        return (
+            f'ALTER TABLE {self._storage_name()} ADD COLUMN IF NOT EXISTS '
+            f'{sa_col.name} {col_type_str} {"NOT " if not sa_col.nullable else ""} NULL'
+        )
+
     def add_column(self, col: catalog.Column) -> None:
         """Add column(s) to the store-resident table based on a catalog column
 
@@ -174,7 +266,7 @@ class StoreBase:
         """
         assert col.is_stored
         conn = Env.get().conn
-        col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
+        col_type_str = col.sa_col_type.compile(dialect=conn.dialect)
         s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
         added_storage_cols = [col.store_name()]
         if col.stores_cellmd:
@@ -197,34 +289,6 @@ class StoreBase:
         log_stmt(_logger, stmt)
         Env.get().conn.execute(stmt)
 
-    def ensure_updated_schema(self) -> None:
-        from pixeltable.utils.dbms import PostgresqlDbms
-
-        # This should only be called during replica creation where the underlying DBMS is Postgres.
-        assert isinstance(Env.get().dbms, PostgresqlDbms)
-
-        conn = Env.get().conn
-        tv = self.tbl_version.get()
-
-        # Ensure columns exist
-        sql_text = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
-        result = conn.execute(sql.text(sql_text))
-        existing_cols = {row[0] for row in result}
-        for col in tv.cols:
-            if col.is_stored and col.store_name() not in existing_cols:
-                _logger.debug(f'Adding missing column {col.store_name()!r} to store table {self._storage_name()!r}')
-                self.add_column(col)
-
-        # Ensure indices exist
-        sql_text = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
-        result = conn.execute(sql.text(sql_text))
-        existing_idxs = {row[0] for row in result}
-        for idx_name, idx_info in tv.all_idxs.items():
-            store_name = tv._store_idx_name(idx_info.id)
-            if store_name not in existing_idxs:
-                _logger.debug(f'Creating missing index {idx_name!r} on store table {self._storage_name()!r}')
-                idx_info.idx.create_index(store_name, idx_info.val_col)
-
     def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
@@ -234,7 +298,7 @@ class StoreBase:
             sql.exc.DBAPIError if there was a SQL error during execution
             excs.Error if on_error='abort' and there was an exception during row evaluation
         """
-        assert col.tbl.id == self.tbl_version.id
+        assert col.get_tbl().id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
@@ -304,7 +368,7 @@ class StoreBase:
         exec_plan: ExecNode,
         v_min: int,
         show_progress: bool = True,
-        rowids: Optional[Iterator[int]] = None,
+        rowids: Iterator[int] | None = None,
         abort_on_exc: bool = False,
     ) -> tuple[set[int], RowCountStats]:
         """Insert rows into the store table and update the catalog table's md
@@ -316,7 +380,7 @@ class StoreBase:
         num_excs = 0
         num_rows = 0
         cols_with_excs: set[int] = set()
-        progress_bar: Optional[tqdm] = None  # create this only after we started executing
+        progress_bar: tqdm | None = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
 
         store_col_names = row_builder.store_column_names()
@@ -389,7 +453,7 @@ class StoreBase:
         # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
         # conn.exec_driver_sql(stmt_text, table_rows)
 
-    def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
+    def _versions_clause(self, versions: list[int | None], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
         if v is None:
@@ -407,9 +471,9 @@ class StoreBase:
     def delete_rows(
         self,
         current_version: int,
-        base_versions: list[Optional[int]],
+        base_versions: list[int | None],
         match_on_vmin: bool,
-        where_clause: Optional[sql.ColumnElement[bool]],
+        where_clause: sql.ColumnElement[bool] | None,
     ) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
         Also: populate the undo columns
@@ -535,7 +599,7 @@ class StoreComponentView(StoreView):
             self.rowid_cols.append(self.pos_col)
         return self.rowid_cols
 
-    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         if tbl_version is None:
             tbl_version = self.tbl_version.get()
         super().create_sa_tbl(tbl_version)
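
Note: the new _exec_if_not_exists() / _store_tbl_exists() / validate() methods replace the ad-hoc error swallowing in the old create() and the removed ensure_updated_schema(). The pattern: run each idempotent DDL statement in its own transaction, treat duplicate-object errors as success, and retry when the target table doesn't exist yet. A minimal standalone sketch of that pattern (assuming a SQLAlchemy 2.x engine on psycopg; run_idempotent_ddl is an illustrative name, not part of pixeltable):

import time

import psycopg
import sqlalchemy as sql


def run_idempotent_ddl(engine: sql.Engine, stmt: str, wait_for_table: str | None = None) -> None:
    """Run an 'IF NOT EXISTS' DDL statement, tolerating concurrent creators (sketch)."""
    while True:
        try:
            # each attempt runs in its own transaction: a failed DDL statement
            # aborts the transaction it runs in, so it cannot be reused
            with engine.begin() as conn:
                if wait_for_table is not None:
                    # lock the target table so ALTER TABLE / CREATE INDEX only proceed once it exists
                    conn.execute(sql.text(f'LOCK TABLE {wait_for_table} IN ACCESS EXCLUSIVE MODE'))
                conn.execute(sql.text(stmt))
            return
        except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
            if isinstance(
                e.orig, (psycopg.errors.UniqueViolation, psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable)
            ):
                return  # another process won the race; the object already exists
            if isinstance(e.orig, psycopg.errors.UndefinedTable):
                time.sleep(1)  # target table not created yet; retry in a fresh transaction
                continue
            raise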
pixeltable/type_system.py CHANGED
@@ -10,7 +10,7 @@ import typing
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Any, ClassVar, Iterable, Literal, Mapping, Optional, Sequence, Union
+from typing import Any, ClassVar, Iterable, Literal, Mapping, Sequence, Union
 
 from typing import _GenericAlias  # type: ignore[attr-defined] # isort: skip
 
@@ -51,11 +51,11 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: Optional['ColumnType.Type'],
-        type2: Optional['ColumnType.Type'],
+        type1: 'ColumnType.Type' | None,
+        type2: 'ColumnType.Type' | None,
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
-    ) -> Optional['ColumnType.Type']:
+    ) -> 'ColumnType.Type' | None:
         if type1 == type2:
             return type1
         t = common_supertypes.get((type1, type2))
@@ -188,7 +188,7 @@ class ColumnType:
         if as_schema:
             return base_str if self.nullable else f'Required[{base_str}]'
         else:
-            return f'Optional[{base_str}]' if self.nullable else base_str
+            return f'{base_str} | None' if self.nullable else base_str
 
     def _to_base_str(self) -> str:
         """
@@ -217,7 +217,7 @@ class ColumnType:
         # Default: just compare base types (this works for all types whose only parameter is nullable)
         return self._type == other._type
 
-    def supertype(self, other: ColumnType) -> Optional[ColumnType]:
+    def supertype(self, other: ColumnType) -> ColumnType | None:
         if self == other:
             return self
         if self.matches(other):
@@ -237,7 +237,7 @@ class ColumnType:
         return None
 
     @classmethod
-    def infer_literal_type(cls, val: Any, nullable: bool = False) -> Optional[ColumnType]:
+    def infer_literal_type(cls, val: Any, nullable: bool = False) -> ColumnType | None:
         if val is None:
             return InvalidType(nullable=True)
         if isinstance(val, str):
@@ -271,7 +271,7 @@ class ColumnType:
         return None
 
     @classmethod
-    def infer_common_literal_type(cls, vals: Iterable[Any]) -> Optional[ColumnType]:
+    def infer_common_literal_type(cls, vals: Iterable[Any]) -> ColumnType | None:
         """
         Returns the most specific type that is a supertype of all literals in `vals`. If no such type
         exists, returns None.
@@ -279,7 +279,7 @@ class ColumnType:
         Args:
             vals: A collection of literals.
         """
-        inferred_type: Optional[ColumnType] = None
+        inferred_type: ColumnType | None = None
         for val in vals:
             val_type = cls.infer_literal_type(val)
             if inferred_type is None:
@@ -299,7 +299,7 @@ class ColumnType:
         nullable_default: bool = False,
         allow_builtin_types: bool = True,
         infer_pydantic_json: bool = False,
-    ) -> Optional[ColumnType]:
+    ) -> ColumnType | None:
         """
         Convert a Python type into a Pixeltable `ColumnType` instance.
 
@@ -317,9 +317,9 @@ class ColumnType:
         origin = typing.get_origin(t)
         type_args = typing.get_args(t)
         if origin in (typing.Union, types.UnionType):
-            # Check if `t` has the form Optional[T].
+            # Check if `t` has the form T | None.
             if len(type_args) == 2 and type(None) in type_args:
-                # `t` is a type of the form Optional[T] (equivalently, T | None or None | T).
+                # `t` is a type of the form T | None (equivalently, T | None or None | T).
                 # We treat it as the underlying type but with nullable=True.
                 underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
                 underlying = cls.from_python_type(
@@ -338,7 +338,7 @@ class ColumnType:
         if isinstance(parameters, ColumnType):
             return parameters.copy(nullable=nullable_default)
         else:
-            # It's something other than Optional[T], Required[T], or an explicitly annotated type.
+            # It's something other than T | None, Required[T], or an explicitly annotated type.
             if origin is not None:
                 # Discard type parameters to ensure that parameterized types such as `list[T]`
                 # are correctly mapped to Pixeltable types.
@@ -411,7 +411,7 @@ class ColumnType:
         raise excs.Error(f'Unknown type: {t}')
 
     @classmethod
-    def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
+    def from_json_schema(cls, schema: dict[str, Any]) -> ColumnType | None:
         # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
         # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
         # type?)
@@ -704,10 +704,10 @@ class DateType(ColumnType):
 
 
 class JsonType(ColumnType):
-    json_schema: Optional[dict[str, Any]]
-    __validator: Optional[jsonschema.protocols.Validator]
+    json_schema: dict[str, Any] | None
+    __validator: jsonschema.protocols.Validator | None
 
-    def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
+    def __init__(self, json_schema: dict[str, Any] | None = None, nullable: bool = False):
         super().__init__(self.Type.JSON, nullable=nullable)
         self.json_schema = json_schema
         if json_schema is None:
@@ -777,7 +777,7 @@ class JsonType(ColumnType):
             return val.model_dump()
         return val
 
-    def supertype(self, other: ColumnType) -> Optional[JsonType]:
+    def supertype(self, other: ColumnType) -> JsonType | None:
         # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
         # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
         # merge their schemas.
@@ -799,7 +799,7 @@ class JsonType(ColumnType):
         )
 
     @classmethod
-    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
         # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
         # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
         # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
@@ -859,7 +859,7 @@ class JsonType(ColumnType):
         return {}  # Unresolvable type conflict; the supertype is an unrestricted JsonType.
 
     @classmethod
-    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any] | None:
         a, a_nullable = cls.__unpack_null_from_schema(a)
         b, b_nullable = cls.__unpack_null_from_schema(b)
 
@@ -888,15 +888,12 @@ class JsonType(ColumnType):
 
 
 class ArrayType(ColumnType):
-    shape: Optional[tuple[Optional[int], ...]]
-    pxt_dtype: Optional[ColumnType]
-    dtype: Optional[ColumnType.Type]
+    shape: tuple[int | None, ...] | None
+    pxt_dtype: ColumnType | None
+    dtype: ColumnType.Type | None
 
     def __init__(
-        self,
-        shape: Optional[tuple[Optional[int], ...]] = None,
-        dtype: Optional[ColumnType] = None,
-        nullable: bool = False,
+        self, shape: tuple[int | None, ...] | None = None, dtype: ColumnType | None = None, nullable: bool = False
     ):
         super().__init__(self.Type.ARRAY, nullable=nullable)
         assert shape is None or dtype is not None, (shape, dtype)  # cannot specify a shape without a dtype
@@ -921,7 +918,7 @@ class ArrayType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
-    def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+    def supertype(self, other: ColumnType) -> ArrayType | None:
         basic_supertype = super().supertype(other)
         if basic_supertype is not None:
             assert isinstance(basic_supertype, ArrayType)
@@ -934,7 +931,7 @@ class ArrayType(ColumnType):
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
-        super_shape: Optional[tuple[Optional[int], ...]]
+        super_shape: tuple[int | None, ...] | None
         if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
@@ -965,7 +962,7 @@ class ArrayType(ColumnType):
         return cls(shape, dtype, nullable=d['nullable'])
 
     @classmethod
-    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
+    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> ColumnType | None:
         """
         Return pixeltable type corresponding to a given simple numpy dtype
         """
@@ -994,10 +991,10 @@ class ArrayType(ColumnType):
         return None
 
     @classmethod
-    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
+    def from_literal(cls, val: np.ndarray, nullable: bool = False) -> ArrayType | None:
         # determine our dtype
         assert isinstance(val, np.ndarray)
-        pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
+        pxttype: ColumnType | None = cls.from_np_dtype(val.dtype, nullable)
         if pxttype is None:
             return None
         return cls(val.shape, dtype=pxttype, nullable=nullable)
@@ -1060,7 +1057,7 @@ class ArrayType(ColumnType):
     def to_sa_type(cls) -> sql.types.TypeEngine:
         return sql.LargeBinary()
 
-    def numpy_dtype(self) -> Optional[np.dtype]:
+    def numpy_dtype(self) -> np.dtype | None:
         if self.dtype is None:
             return None
         if self.dtype == self.Type.INT:
@@ -1077,10 +1074,10 @@ class ArrayType(ColumnType):
 class ImageType(ColumnType):
     def __init__(
         self,
-        width: Optional[int] = None,
-        height: Optional[int] = None,
-        size: Optional[tuple[int, int]] = None,
-        mode: Optional[str] = None,
+        width: int | None = None,
+        height: int | None = None,
+        size: tuple[int, int] | None = None,
+        mode: str | None = None,
         nullable: bool = False,
     ):
         # TODO: does it make sense to specify only width or height?
@@ -1121,7 +1118,7 @@ class ImageType(ColumnType):
     def __hash__(self) -> int:
         return hash((self._type, self.nullable, self.size, self.mode))
 
-    def supertype(self, other: ColumnType) -> Optional[ImageType]:
+    def supertype(self, other: ColumnType) -> ImageType | None:
         basic_supertype = super().supertype(other)
         if basic_supertype is not None:
             assert isinstance(basic_supertype, ImageType)
@@ -1136,7 +1133,7 @@ class ImageType(ColumnType):
         return ImageType(width=width, height=height, mode=mode, nullable=(self.nullable or other.nullable))
 
     @property
-    def size(self) -> Optional[tuple[int, int]]:
+    def size(self) -> tuple[int, int] | None:
         if self.width is None or self.height is None:
             return None
         return (self.width, self.height)
@@ -1255,7 +1252,7 @@ class DocumentType(ColumnType):
         TXT = 4
 
         @classmethod
-        def from_extension(cls, ext: str) -> Optional['DocumentType.DocumentFormat']:
+        def from_extension(cls, ext: str) -> 'DocumentType.DocumentFormat' | None:
             if ext in ('.htm', '.html'):
                 return cls.HTML
             if ext == '.md':
@@ -1268,7 +1265,7 @@ class DocumentType(ColumnType):
                 return cls.TXT
             return None
 
-    def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
+    def __init__(self, nullable: bool = False, doc_formats: str | None = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
         self.doc_formats = doc_formats
         if doc_formats is not None:
@@ -1365,13 +1362,13 @@ class Array(np.ndarray, _PxtType):
     def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
         """
        `item` (the type subscript) must be a tuple with exactly two elements (in any order):
-        - A tuple of `Optional[int]`s, specifying the shape of the array
+        - A tuple of `int | None`s, specifying the shape of the array
        - A type, specifying the dtype of the array
        Example: Array[(3, None, 2), pxt.Float]
        """
        params = item if isinstance(item, tuple) else (item,)
-        shape: Optional[tuple] = None
-        dtype: Optional[ColumnType] = None
+        shape: tuple | None = None
+        dtype: ColumnType | None = None
        if not any(isinstance(param, (type, _AnnotatedAlias)) for param in params):
            raise TypeError('Array type parameter must include a dtype.')
        for param in params:
@@ -1411,8 +1408,8 @@ class Image(PIL.Image.Image, _PxtType):
         else:
             # Not a tuple (single arg)
             params = (item,)
-        size: Optional[tuple] = None
-        mode: Optional[str] = None
+        size: tuple | None = None
+        mode: str | None = None
         for param in params:
             if isinstance(param, tuple):
                 if (
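
Note: nearly every change in this file mechanically migrates typing.Optional[T] annotations to the PEP 604 spelling T | None (which requires Python >= 3.10, or stringized annotations). The two spellings are interchangeable at runtime; a standard-library-only illustration (not code from the package):

import types
import typing

# Optional[int] and int | None denote the same union; typing.Union compares
# equal to the PEP 604 form
assert typing.Optional[int] == int | None
assert typing.get_args(int | None) == (int, type(None))

# the two spellings have different origins at runtime, which is why
# from_python_type() above checks for both typing.Union and types.UnionType
assert typing.get_origin(typing.Optional[int]) is typing.Union
assert typing.get_origin(int | None) is types.UnionType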
pixeltable/utils/__init__.py CHANGED
@@ -2,7 +2,6 @@ import hashlib
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Optional
 
 
 def print_perf_counter_delta(delta: float) -> str:
@@ -39,7 +38,7 @@ def sha256sum(path: Path | str) -> str:
     return h.hexdigest()
 
 
-def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+def parse_local_file_path(file_or_url: str) -> Path | None:
     """
     Parses a string that may be either a URL or a local file path.
 
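Note: parse_local_file_path() now returns Path | None. For reference, a plausible sketch of what a URL-vs-local-path helper like this does, based only on the docstring above (not pixeltable's actual implementation):

import urllib.parse
import urllib.request
from pathlib import Path


def parse_local_file_path(file_or_url: str) -> Path | None:
    """Return a Path if the string denotes a local file, else None (sketch)."""
    parsed = urllib.parse.urlparse(file_or_url)
    if parsed.scheme == 'file':
        # translate file:// URLs (including percent-encoding) into a filesystem path
        return Path(urllib.request.url2pathname(parsed.path))
    if parsed.scheme == '':
        # no scheme: treat as a plain local path; note that a single-letter
        # scheme (e.g. 'c' from 'C:\...') would need Windows-specific handling
        return Path(file_or_url)
    return None  # http://, s3://, etc.: not a local file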
pixeltable/utils/arrow.py CHANGED
@@ -1,7 +1,7 @@
 import datetime
 import io
 import json
-from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
+from typing import TYPE_CHECKING, Any, Iterator, cast
 
 import numpy as np
 import PIL.Image
@@ -48,7 +48,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }
 
 
-def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
@@ -66,7 +66,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
     return None
 
 
-def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
+def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
@@ -240,7 +240,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
 
 
 def iter_tuples2(
-    batch: pa.Table | pa.RecordBatch, col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+    batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
 ) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
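
Note: to_pixeltable_type() and to_arrow_type() translate between Arrow and Pixeltable column types via lookup tables such as PXT_TO_PA_TYPES. A self-contained sketch of the same lookup-table approach using only pyarrow (the string labels stand in for pixeltable's ColumnType classes):

import pyarrow as pa

# illustrative lookup table in the style of PXT_TO_PA_TYPES (inverted); Arrow
# DataType instances hash and compare by value, so they work as dict keys
PA_TO_LABEL: dict[pa.DataType, str] = {
    pa.int64(): 'Int',
    pa.float32(): 'Float',
    pa.string(): 'String',
    pa.bool_(): 'Bool',
}


def to_label(arrow_type: pa.DataType) -> str | None:
    """Map an Arrow DataType to a label if one is defined, else None."""
    if isinstance(arrow_type, pa.TimestampType):
        # parameterized types (unit, tz) need isinstance checks, not equality lookup
        return 'Timestamp'
    return PA_TO_LABEL.get(arrow_type)


assert to_label(pa.int64()) == 'Int'
assert to_label(pa.timestamp('us', tz='UTC')) == 'Timestamp'
assert to_label(pa.decimal128(10, 2)) is None  # no conversion defined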
pixeltable/utils/av.py CHANGED
@@ -5,6 +5,14 @@ import av.stream
 
 from pixeltable.env import Env
 
+# format -> (codec, extension)
+AUDIO_FORMATS: dict[str, tuple[str, str]] = {
+    'wav': ('pcm_s16le', 'wav'),
+    'mp3': ('libmp3lame', 'mp3'),
+    'flac': ('flac', 'flac'),
+    'mp4': ('aac', 'm4a'),
+}
+
 
 def get_metadata(path: str) -> dict:
     with av.open(path) as container:
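
Note: the new AUDIO_FORMATS table maps an output format to the (codec, file extension) pair used when writing audio, presumably supporting the new pixeltable/functions/audio.py module. A sketch of how such a table is commonly used with PyAV to extract a file's audio track (extract_audio is an illustrative helper, not pixeltable's API):

import av

# mirror of the AUDIO_FORMATS table above
AUDIO_FORMATS: dict[str, tuple[str, str]] = {
    'wav': ('pcm_s16le', 'wav'),
    'mp3': ('libmp3lame', 'mp3'),
    'flac': ('flac', 'flac'),
    'mp4': ('aac', 'm4a'),
}


def extract_audio(video_path: str, fmt: str = 'wav') -> str:
    """Re-encode the first audio stream of a media file into a standalone file (sketch)."""
    codec, ext = AUDIO_FORMATS[fmt]
    out_path = f'{video_path}.audio.{ext}'
    with av.open(video_path) as src, av.open(out_path, 'w') as dst:
        out_stream = dst.add_stream(codec)
        # a production version may need av.AudioResampler when the source sample
        # format doesn't match what the encoder expects
        for frame in src.decode(src.streams.audio[0]):
            dst.mux(out_stream.encode(frame))
        dst.mux(out_stream.encode())  # flush the encoder
    return out_path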