pixeltable 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (152)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/catalog.py +119 -100
  4. pixeltable/catalog/column.py +104 -115
  5. pixeltable/catalog/globals.py +1 -2
  6. pixeltable/catalog/insertable_table.py +44 -49
  7. pixeltable/catalog/path.py +3 -4
  8. pixeltable/catalog/schema_object.py +4 -4
  9. pixeltable/catalog/table.py +118 -122
  10. pixeltable/catalog/table_metadata.py +6 -6
  11. pixeltable/catalog/table_version.py +322 -257
  12. pixeltable/catalog/table_version_handle.py +4 -4
  13. pixeltable/catalog/table_version_path.py +9 -10
  14. pixeltable/catalog/tbl_ops.py +9 -3
  15. pixeltable/catalog/view.py +34 -28
  16. pixeltable/config.py +14 -10
  17. pixeltable/dataframe.py +68 -77
  18. pixeltable/env.py +74 -64
  19. pixeltable/exec/aggregation_node.py +6 -6
  20. pixeltable/exec/cache_prefetch_node.py +10 -10
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +4 -5
  23. pixeltable/exec/exec_node.py +5 -5
  24. pixeltable/exec/expr_eval/evaluators.py +6 -6
  25. pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
  26. pixeltable/exec/expr_eval/globals.py +6 -6
  27. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  28. pixeltable/exec/expr_eval/schedulers.py +11 -11
  29. pixeltable/exec/in_memory_data_node.py +2 -2
  30. pixeltable/exec/object_store_save_node.py +14 -17
  31. pixeltable/exec/sql_node.py +25 -25
  32. pixeltable/exprs/arithmetic_expr.py +4 -4
  33. pixeltable/exprs/array_slice.py +2 -2
  34. pixeltable/exprs/column_property_ref.py +3 -3
  35. pixeltable/exprs/column_ref.py +61 -74
  36. pixeltable/exprs/comparison.py +5 -5
  37. pixeltable/exprs/compound_predicate.py +3 -3
  38. pixeltable/exprs/data_row.py +12 -12
  39. pixeltable/exprs/expr.py +41 -31
  40. pixeltable/exprs/expr_dict.py +3 -3
  41. pixeltable/exprs/expr_set.py +3 -3
  42. pixeltable/exprs/function_call.py +14 -14
  43. pixeltable/exprs/in_predicate.py +4 -4
  44. pixeltable/exprs/inline_expr.py +8 -8
  45. pixeltable/exprs/is_null.py +1 -3
  46. pixeltable/exprs/json_mapper.py +8 -8
  47. pixeltable/exprs/json_path.py +6 -6
  48. pixeltable/exprs/literal.py +5 -5
  49. pixeltable/exprs/method_ref.py +2 -2
  50. pixeltable/exprs/object_ref.py +2 -2
  51. pixeltable/exprs/row_builder.py +14 -14
  52. pixeltable/exprs/rowid_ref.py +8 -8
  53. pixeltable/exprs/similarity_expr.py +50 -25
  54. pixeltable/exprs/sql_element_cache.py +4 -4
  55. pixeltable/exprs/string_op.py +2 -2
  56. pixeltable/exprs/type_cast.py +3 -5
  57. pixeltable/func/aggregate_function.py +8 -8
  58. pixeltable/func/callable_function.py +9 -9
  59. pixeltable/func/expr_template_function.py +3 -3
  60. pixeltable/func/function.py +15 -17
  61. pixeltable/func/function_registry.py +6 -7
  62. pixeltable/func/globals.py +2 -3
  63. pixeltable/func/mcp.py +2 -2
  64. pixeltable/func/query_template_function.py +16 -16
  65. pixeltable/func/signature.py +14 -14
  66. pixeltable/func/tools.py +11 -11
  67. pixeltable/func/udf.py +16 -18
  68. pixeltable/functions/__init__.py +1 -0
  69. pixeltable/functions/anthropic.py +7 -7
  70. pixeltable/functions/audio.py +76 -0
  71. pixeltable/functions/bedrock.py +6 -6
  72. pixeltable/functions/deepseek.py +4 -4
  73. pixeltable/functions/fireworks.py +2 -2
  74. pixeltable/functions/gemini.py +6 -6
  75. pixeltable/functions/globals.py +12 -12
  76. pixeltable/functions/groq.py +4 -4
  77. pixeltable/functions/huggingface.py +18 -20
  78. pixeltable/functions/image.py +7 -10
  79. pixeltable/functions/llama_cpp.py +7 -7
  80. pixeltable/functions/math.py +2 -3
  81. pixeltable/functions/mistralai.py +3 -3
  82. pixeltable/functions/ollama.py +9 -9
  83. pixeltable/functions/openai.py +21 -21
  84. pixeltable/functions/openrouter.py +7 -7
  85. pixeltable/functions/string.py +21 -28
  86. pixeltable/functions/timestamp.py +7 -8
  87. pixeltable/functions/together.py +4 -6
  88. pixeltable/functions/twelvelabs.py +92 -0
  89. pixeltable/functions/video.py +2 -24
  90. pixeltable/functions/vision.py +6 -6
  91. pixeltable/functions/whisper.py +7 -7
  92. pixeltable/functions/whisperx.py +16 -16
  93. pixeltable/globals.py +52 -36
  94. pixeltable/index/base.py +12 -8
  95. pixeltable/index/btree.py +19 -22
  96. pixeltable/index/embedding_index.py +30 -39
  97. pixeltable/io/datarows.py +3 -3
  98. pixeltable/io/external_store.py +13 -16
  99. pixeltable/io/fiftyone.py +5 -5
  100. pixeltable/io/globals.py +5 -5
  101. pixeltable/io/hf_datasets.py +4 -4
  102. pixeltable/io/label_studio.py +12 -12
  103. pixeltable/io/pandas.py +6 -6
  104. pixeltable/io/parquet.py +2 -2
  105. pixeltable/io/table_data_conduit.py +12 -12
  106. pixeltable/io/utils.py +2 -2
  107. pixeltable/iterators/audio.py +2 -2
  108. pixeltable/iterators/video.py +8 -13
  109. pixeltable/metadata/converters/convert_18.py +2 -2
  110. pixeltable/metadata/converters/convert_19.py +2 -2
  111. pixeltable/metadata/converters/convert_20.py +2 -2
  112. pixeltable/metadata/converters/convert_21.py +2 -2
  113. pixeltable/metadata/converters/convert_22.py +2 -2
  114. pixeltable/metadata/converters/convert_24.py +2 -2
  115. pixeltable/metadata/converters/convert_25.py +2 -2
  116. pixeltable/metadata/converters/convert_26.py +2 -2
  117. pixeltable/metadata/converters/convert_29.py +4 -4
  118. pixeltable/metadata/converters/convert_34.py +2 -2
  119. pixeltable/metadata/converters/convert_36.py +2 -2
  120. pixeltable/metadata/converters/convert_38.py +2 -2
  121. pixeltable/metadata/converters/convert_39.py +1 -2
  122. pixeltable/metadata/converters/util.py +11 -13
  123. pixeltable/metadata/schema.py +22 -21
  124. pixeltable/metadata/utils.py +2 -6
  125. pixeltable/mypy/mypy_plugin.py +5 -5
  126. pixeltable/plan.py +30 -28
  127. pixeltable/share/packager.py +7 -7
  128. pixeltable/share/publish.py +3 -3
  129. pixeltable/store.py +125 -61
  130. pixeltable/type_system.py +43 -46
  131. pixeltable/utils/__init__.py +1 -2
  132. pixeltable/utils/arrow.py +4 -4
  133. pixeltable/utils/av.py +8 -0
  134. pixeltable/utils/azure_store.py +305 -0
  135. pixeltable/utils/code.py +1 -2
  136. pixeltable/utils/dbms.py +15 -19
  137. pixeltable/utils/description_helper.py +2 -3
  138. pixeltable/utils/documents.py +5 -6
  139. pixeltable/utils/exception_handler.py +2 -2
  140. pixeltable/utils/filecache.py +5 -5
  141. pixeltable/utils/formatter.py +4 -6
  142. pixeltable/utils/gcs_store.py +9 -9
  143. pixeltable/utils/local_store.py +17 -17
  144. pixeltable/utils/object_stores.py +59 -43
  145. pixeltable/utils/s3_store.py +35 -30
  146. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/METADATA +1 -1
  147. pixeltable-0.4.19.dist-info/RECORD +213 -0
  148. pixeltable/__version__.py +0 -3
  149. pixeltable-0.4.18.dist-info/RECORD +0 -211
  150. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
  151. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
  152. {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Union
7
7
 
8
8
  import pandas as pd
9
9
  import pydantic
@@ -14,6 +14,7 @@ from pixeltable.catalog import Catalog, TableVersionPath
14
14
  from pixeltable.catalog.insertable_table import OnErrorParameter
15
15
  from pixeltable.config import Config
16
16
  from pixeltable.env import Env
17
+ from pixeltable.io.table_data_conduit import DFTableDataConduit, TableDataConduit
17
18
  from pixeltable.iterators import ComponentIterator
18
19
 
19
20
  if TYPE_CHECKING:
@@ -36,7 +37,7 @@ if TYPE_CHECKING:
36
37
  _logger = logging.getLogger('pixeltable')
37
38
 
38
39
 
39
- def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
40
+ def init(config_overrides: dict[str, Any] | None = None) -> None:
40
41
  """Initializes the Pixeltable environment."""
41
42
  if config_overrides is None:
42
43
  config_overrides = {}
@@ -46,18 +47,19 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
46
47
 
47
48
  def create_table(
48
49
  path: str,
49
- schema: Optional[dict[str, Any]] = None,
50
+ schema: dict[str, Any] | None = None,
50
51
  *,
51
- source: Optional[TableDataSource] = None,
52
- source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
53
- schema_overrides: Optional[dict[str, Any]] = None,
52
+ source: TableDataSource | None = None,
53
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
54
+ schema_overrides: dict[str, Any] | None = None,
55
+ create_default_idxs: bool = True,
54
56
  on_error: Literal['abort', 'ignore'] = 'abort',
55
57
  primary_key: str | list[str] | None = None,
56
58
  num_retained_versions: int = 10,
57
59
  comment: str = '',
58
60
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
59
61
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
60
- extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
62
+ extra_args: dict[str, Any] | None = None, # Additional arguments to data source provider
61
63
  ) -> catalog.Table:
62
64
  """Create a new base table. Exactly one of `schema` or `source` must be provided.
63
65
 
@@ -77,6 +79,8 @@ def create_table(
77
79
  schema_overrides: Must be used in conjunction with a `source`.
78
80
  If specified, then columns in `schema_overrides` will be given the specified types.
79
81
  (Pixeltable will attempt to infer the types of any columns not specified.)
82
+ create_default_idxs: If True, creates a B-tree index on every scalar and media column that is not computed,
83
+ except for boolean columns.
80
84
  on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
81
85
  invalid media file (such as a corrupt image) for one of the inserted rows.
82
86
 
@@ -138,7 +142,7 @@ def create_table(
138
142
 
139
143
  >>> tbl = pxt.create_table('my_table', source='data.csv')
140
144
  """
141
- from pixeltable.io.table_data_conduit import DFTableDataConduit, UnkTableDataConduit
145
+ from pixeltable.io.table_data_conduit import UnkTableDataConduit
142
146
  from pixeltable.io.utils import normalize_primary_key_parameter
143
147
 
144
148
  if (schema is None) == (source is None):
@@ -150,11 +154,16 @@ def create_table(
150
154
  path_obj = catalog.Path.parse(path)
151
155
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
152
156
  media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
153
- primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
154
- table: catalog.Table = None
155
- tds = None
156
- data_source = None
157
+ primary_key: list[str] | None = normalize_primary_key_parameter(primary_key)
158
+ data_source: TableDataConduit | None = None
157
159
  if source is not None:
160
+ if isinstance(source, str) and source.strip().startswith('pxt://'):
161
+ raise excs.Error(
162
+ 'create_table(): Creating a table directly from a cloud URI is not supported.'
163
+ ' Please replicate the table locally first using `pxt.replicate()`:\n'
164
+ "replica_tbl = pxt.replicate('pxt://path/to/remote_table', 'local_replica_name')\n"
165
+ "pxt.create_table('new_table_name', source=replica_tbl)"
166
+ )
158
167
  tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
159
168
  tds.check_source_format()
160
169
  data_source = tds.specialize()
@@ -179,35 +188,43 @@ def create_table(
179
188
  'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
180
189
  )
181
190
 
182
- table, was_created = Catalog.get().create_table(
191
+ tbl, was_created = Catalog.get().create_table(
183
192
  path_obj,
184
193
  schema,
185
- data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
186
194
  if_exists=if_exists_,
187
195
  primary_key=primary_key,
188
196
  comment=comment,
189
197
  media_validation=media_validation_,
190
198
  num_retained_versions=num_retained_versions,
199
+ create_default_idxs=create_default_idxs,
191
200
  )
192
- if was_created and data_source is not None and not is_direct_df:
201
+
202
+ # TODO: combine data loading with table creation into a single transaction
203
+ if was_created:
193
204
  fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
194
- table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
205
+ if isinstance(data_source, DFTableDataConduit):
206
+ df = data_source.pxt_df
207
+ with Catalog.get().begin_xact(tbl=tbl._tbl_version_path, for_write=True, lock_mutable_tree=True):
208
+ tbl._tbl_version.get().insert(None, df, fail_on_exception=fail_on_exception)
209
+ elif data_source is not None and not is_direct_df:
210
+ tbl.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
195
211
 
196
- return table
212
+ return tbl
197
213
 
198
214
 
199
215
  def create_view(
200
216
  path: str,
201
217
  base: catalog.Table | DataFrame,
202
218
  *,
203
- additional_columns: Optional[dict[str, Any]] = None,
219
+ additional_columns: dict[str, Any] | None = None,
204
220
  is_snapshot: bool = False,
205
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
221
+ create_default_idxs: bool = False,
222
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
206
223
  num_retained_versions: int = 10,
207
224
  comment: str = '',
208
225
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
209
226
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
210
- ) -> Optional[catalog.Table]:
227
+ ) -> catalog.Table | None:
211
228
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
212
229
 
213
230
  Args:
@@ -220,6 +237,8 @@ def create_view(
220
237
  [`create_table`][pixeltable.create_table].
221
238
  is_snapshot: Whether the view is a snapshot. Setting this to `True` is equivalent to calling
222
239
  [`create_snapshot`][pixeltable.create_snapshot].
240
+ create_default_idxs: Whether to create default indexes on the view's columns (the base's columns are excluded).
241
+ Cannot be `True` for snapshots.
223
242
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
224
243
  the base table.
225
244
  num_retained_versions: Number of versions of the view to retain.
@@ -267,9 +286,11 @@ def create_view(
267
286
  >>> tbl = pxt.get_table('my_table')
268
287
  ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
269
288
  """
289
+ if is_snapshot and create_default_idxs is True:
290
+ raise excs.Error('Cannot create default indexes on a snapshot')
270
291
  tbl_version_path: TableVersionPath
271
- select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None
272
- where: Optional[exprs.Expr] = None
292
+ select_list: list[tuple[exprs.Expr, str | None]] | None = None
293
+ where: exprs.Expr | None = None
273
294
  if isinstance(base, catalog.Table):
274
295
  tbl_version_path = base._tbl_version_path
275
296
  sample_clause = None
@@ -297,7 +318,7 @@ def create_view(
297
318
  if col_name in [c.name for c in tbl_version_path.columns()]:
298
319
  raise excs.Error(
299
320
  f'Column {col_name!r} already exists in the base table '
300
- f'{tbl_version_path.get_column(col_name).tbl.name}.'
321
+ f'{tbl_version_path.get_column(col_name).get_tbl().name}.'
301
322
  )
302
323
 
303
324
  return Catalog.get().create_view(
@@ -308,6 +329,7 @@ def create_view(
308
329
  sample_clause=sample_clause,
309
330
  additional_columns=additional_columns,
310
331
  is_snapshot=is_snapshot,
332
+ create_default_idxs=create_default_idxs,
311
333
  iterator=iterator,
312
334
  num_retained_versions=num_retained_versions,
313
335
  comment=comment,
@@ -320,13 +342,13 @@ def create_snapshot(
320
342
  path_str: str,
321
343
  base: catalog.Table | DataFrame,
322
344
  *,
323
- additional_columns: Optional[dict[str, Any]] = None,
324
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
345
+ additional_columns: dict[str, Any] | None = None,
346
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
325
347
  num_retained_versions: int = 10,
326
348
  comment: str = '',
327
349
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
328
350
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
329
- ) -> Optional[catalog.Table]:
351
+ ) -> catalog.Table | None:
330
352
  """Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
331
353
 
332
354
  Args:
@@ -680,7 +702,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
680
702
 
681
703
  def create_dir(
682
704
  path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
683
- ) -> Optional[catalog.Dir]:
705
+ ) -> catalog.Dir | None:
684
706
  """Create a directory.
685
707
 
686
708
  Args:
@@ -835,9 +857,7 @@ def ls(path: str = '') -> pd.DataFrame:
835
857
 
836
858
 
837
859
  def _extract_paths(
838
- dir_entries: dict[str, Catalog.DirEntry],
839
- parent: catalog.Path,
840
- entry_type: Optional[type[catalog.SchemaObject]] = None,
860
+ dir_entries: dict[str, Catalog.DirEntry], parent: catalog.Path, entry_type: type[catalog.SchemaObject] | None = None
841
861
  ) -> list[catalog.Path]:
842
862
  """Convert nested dir_entries structure to a flattened list of paths."""
843
863
  matches: list[str]
@@ -947,7 +967,7 @@ def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
947
967
  return func.tools.Tools(tools=[arg if isinstance(arg, func.tools.Tool) else tool(arg) for arg in args])
948
968
 
949
969
 
950
- def tool(fn: func.Function, name: Optional[str] = None, description: Optional[str] = None) -> func.tools.Tool:
970
+ def tool(fn: func.Function, name: str | None = None, description: str | None = None) -> func.tools.Tool:
951
971
  """
952
972
  Specifies a Pixeltable UDF to be used as an LLM tool with customizable metadata. See the documentation for
953
973
  [pxt.tools()][pixeltable.tools] for more details.
@@ -968,11 +988,7 @@ def tool(fn: func.Function, name: Optional[str] = None, description: Optional[st
968
988
 
969
989
 
970
990
  def configure_logging(
971
- *,
972
- to_stdout: Optional[bool] = None,
973
- level: Optional[int] = None,
974
- add: Optional[str] = None,
975
- remove: Optional[str] = None,
991
+ *, to_stdout: bool | None = None, level: int | None = None, add: str | None = None, remove: str | None = None
976
992
  ) -> None:
977
993
  """Configure logging.
978
994
 
pixeltable/index/base.py CHANGED
@@ -5,7 +5,9 @@ from typing import Any
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
- from pixeltable import catalog, exprs
8
+ import pixeltable.catalog as catalog
9
+ import pixeltable.exprs as exprs
10
+ import pixeltable.type_system as ts
9
11
 
10
12
 
11
13
  class IndexBase(abc.ABC):
@@ -18,12 +20,14 @@ class IndexBase(abc.ABC):
18
20
  """
19
21
 
20
22
  @abc.abstractmethod
21
- def __init__(self, c: catalog.Column, **kwargs: Any):
23
+ def __init__(self, **kwargs: Any):
22
24
  pass
23
25
 
24
26
  @abc.abstractmethod
25
- def index_value_expr(self) -> exprs.Expr:
26
- """Return expression that computes the value that goes into the index"""
27
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
28
+ """
29
+ Validates that the index can be created on column c and returns an expression that computes the index value.
30
+ """
27
31
  pass
28
32
 
29
33
  @abc.abstractmethod
@@ -32,13 +36,13 @@ class IndexBase(abc.ABC):
32
36
  pass
33
37
 
34
38
  @abc.abstractmethod
35
- def index_sa_type(self) -> sql.types.TypeEngine:
39
+ def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
36
40
  """Return the sqlalchemy type of the index value column"""
37
41
  pass
38
42
 
39
43
  @abc.abstractmethod
40
- def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
41
- """Create the index on the index value column"""
44
+ def sa_index(self, index_name: str, index_value_col: catalog.Column) -> sql.Index:
45
+ """Return a sqlalchemy Index instance"""
42
46
  pass
43
47
 
44
48
  @abc.abstractmethod
@@ -57,5 +61,5 @@ class IndexBase(abc.ABC):
57
61
 
58
62
  @classmethod
59
63
  @abc.abstractmethod
60
- def from_dict(cls, c: catalog.Column, d: dict) -> IndexBase:
64
+ def from_dict(cls, d: dict) -> IndexBase:
61
65
  pass
pixeltable/index/btree.py CHANGED
@@ -1,18 +1,18 @@
1
- from typing import TYPE_CHECKING, Optional
1
+ from typing import TYPE_CHECKING
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
5
  # TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
6
6
  # import pixeltable.catalog as catalog
7
7
  import pixeltable.exceptions as excs
8
- from pixeltable import catalog, exprs
9
- from pixeltable.env import Env
8
+ import pixeltable.exprs as exprs
9
+ import pixeltable.type_system as ts
10
10
  from pixeltable.func.udf import udf
11
11
 
12
12
  from .base import IndexBase
13
13
 
14
14
  if TYPE_CHECKING:
15
- import pixeltable.exprs
15
+ import pixeltable.catalog as catalog
16
16
 
17
17
 
18
18
  class BtreeIndex(IndexBase):
@@ -22,42 +22,39 @@ class BtreeIndex(IndexBase):
22
22
 
23
23
  MAX_STRING_LEN = 256
24
24
 
25
- value_expr: 'pixeltable.exprs.Expr'
26
-
27
25
  @staticmethod
28
26
  @udf
29
- def str_filter(s: Optional[str]) -> Optional[str]:
27
+ def str_filter(s: str | None) -> str | None:
30
28
  if s is None:
31
29
  return None
32
30
  return s[: BtreeIndex.MAX_STRING_LEN]
33
31
 
34
- def __init__(self, c: 'catalog.Column'):
32
+ def __init__(self) -> None:
33
+ pass
34
+
35
+ def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
35
36
  if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
36
37
  raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
38
+ value_expr: exprs.Expr
37
39
  if c.col_type.is_media_type():
38
40
  # an index on a media column is an index on the file url
39
41
  # no validation for media columns: we're only interested in the string value
40
- self.value_expr = exprs.ColumnRef(c, perform_validation=False)
42
+ value_expr = exprs.ColumnRef(c, perform_validation=False)
41
43
  else:
42
- self.value_expr = (
44
+ value_expr = (
43
45
  BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
44
46
  )
45
-
46
- def index_value_expr(self) -> 'exprs.Expr':
47
- return self.value_expr
47
+ return value_expr
48
48
 
49
49
  def records_value_errors(self) -> bool:
50
50
  return False
51
51
 
52
- def index_sa_type(self) -> sql.types.TypeEngine:
52
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
53
53
  """Return the sqlalchemy type of the index value column"""
54
- return self.value_expr.col_type.to_sa_type()
54
+ return val_col_type.to_sa_type()
55
55
 
56
- def create_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
57
- """Create the index on the index value column"""
58
- idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
59
- conn = Env.get().conn
60
- idx.create(bind=conn)
56
+ def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
57
+ return sql.Index(store_index_name, index_value_col.sa_col, postgresql_using='btree')
61
58
 
62
59
  def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
63
60
  """Drop the index on the index value column"""
@@ -72,5 +69,5 @@ class BtreeIndex(IndexBase):
72
69
  return {}
73
70
 
74
71
  @classmethod
75
- def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
76
- return cls(c)
72
+ def from_dict(cls, d: dict) -> 'BtreeIndex':
73
+ return cls()
pixeltable/index/embedding_index.py CHANGED
@@ -1,16 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import enum
4
- from typing import Any, ClassVar, Optional
4
+ from typing import Any, ClassVar
5
5
 
6
6
  import numpy as np
7
7
  import pgvector.sqlalchemy # type: ignore[import-untyped]
8
8
  import PIL.Image
9
9
  import sqlalchemy as sql
10
10
 
11
+ import pixeltable.catalog as catalog
11
12
  import pixeltable.exceptions as excs
13
+ import pixeltable.exprs as exprs
14
+ import pixeltable.func as func
12
15
  import pixeltable.type_system as ts
13
- from pixeltable import catalog, exprs, func
14
16
  from pixeltable.env import Env
15
17
 
16
18
  from .base import IndexBase
@@ -39,28 +41,23 @@ class EmbeddingIndex(IndexBase):
39
41
  }
40
42
 
41
43
  metric: Metric
42
- value_expr: exprs.FunctionCall
43
- string_embed: Optional[func.Function]
44
- image_embed: Optional[func.Function]
44
+ string_embed: func.Function | None
45
+ image_embed: func.Function | None
45
46
  string_embed_signature_idx: int
46
47
  image_embed_signature_idx: int
47
- index_col_type: pgvector.sqlalchemy.Vector
48
48
 
49
49
  def __init__(
50
50
  self,
51
- c: catalog.Column,
52
51
  metric: str,
53
- embed: Optional[func.Function] = None,
54
- string_embed: Optional[func.Function] = None,
55
- image_embed: Optional[func.Function] = None,
52
+ embed: func.Function | None = None,
53
+ string_embed: func.Function | None = None,
54
+ image_embed: func.Function | None = None,
56
55
  ):
57
56
  if embed is None and string_embed is None and image_embed is None:
58
57
  raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
59
58
  metric_names = [m.name.lower() for m in self.Metric]
60
59
  if metric.lower() not in metric_names:
61
60
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
62
- if not c.col_type.is_string_type() and not c.col_type.is_image_type():
63
- raise excs.Error('Embedding index requires string or image column')
64
61
 
65
62
  self.string_embed = None
66
63
  self.image_embed = None
@@ -102,47 +99,42 @@ class EmbeddingIndex(IndexBase):
102
99
  )
103
100
 
104
101
  # Now validate the return types of the embedding functions.
105
-
106
102
  if self.string_embed is not None:
107
103
  self._validate_embedding_fn(self.string_embed)
108
-
109
104
  if self.image_embed is not None:
110
105
  self._validate_embedding_fn(self.image_embed)
111
106
 
107
+ self.metric = self.Metric[metric.upper()]
108
+
109
+ def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
110
+ if not c.col_type.is_string_type() and not c.col_type.is_image_type():
111
+ raise excs.Error(
112
+ f'Embedding index requires string or image column, column {c.name!r} has type {c.col_type}'
113
+ )
112
114
  if c.col_type.is_string_type() and self.string_embed is None:
113
115
  raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
114
116
  if c.col_type.is_image_type() and self.image_embed is None:
115
117
  raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
116
118
 
117
- self.metric = self.Metric[metric.upper()]
118
- self.value_expr = (
119
+ return (
119
120
  self.string_embed(exprs.ColumnRef(c))
120
121
  if c.col_type.is_string_type()
121
122
  else self.image_embed(exprs.ColumnRef(c))
122
123
  )
123
- assert isinstance(self.value_expr.col_type, ts.ArrayType)
124
- vector_size = self.value_expr.col_type.shape[0]
125
- assert vector_size is not None
126
- self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
127
-
128
- def index_value_expr(self) -> exprs.Expr:
129
- """Return expression that computes the value that goes into the index"""
130
- return self.value_expr
131
124
 
132
125
  def records_value_errors(self) -> bool:
133
126
  return True
134
127
 
135
- def index_sa_type(self) -> sql.types.TypeEngine:
136
- """Return the sqlalchemy type of the index value column"""
137
- return self.index_col_type
128
+ def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
129
+ assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
130
+ vector_size = val_col_type.shape[0]
131
+ assert vector_size is not None
132
+ return pgvector.sqlalchemy.Vector(vector_size)
138
133
 
139
- def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
134
+ def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
140
135
  """Create the index on the index value column"""
141
- Env.get().dbms.create_vector_index(
142
- index_name=index_name,
143
- index_value_sa_col=index_value_col.sa_col,
144
- conn=Env.get().conn,
145
- metric=self.PGVECTOR_OPS[self.metric],
136
+ return Env.get().dbms.sa_vector_index(
137
+ store_index_name, index_value_col.sa_col, metric=self.PGVECTOR_OPS[self.metric]
146
138
  )
147
139
 
148
140
  def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
@@ -153,6 +145,7 @@ class EmbeddingIndex(IndexBase):
153
145
  def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
154
146
  """Create a ColumnElement that represents '<val_column> <op> <item>'"""
155
147
  assert isinstance(item, (str, PIL.Image.Image))
148
+ embedding: np.ndarray
156
149
  if isinstance(item, str):
157
150
  assert self.string_embed is not None
158
151
  embedding = self.string_embed.exec([item], {})
@@ -171,7 +164,7 @@ class EmbeddingIndex(IndexBase):
171
164
  def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
172
165
  """Create a ColumnElement that is used in an ORDER BY clause"""
173
166
  assert isinstance(item, (str, PIL.Image.Image))
174
- embedding: Optional[np.ndarray] = None
167
+ embedding: np.ndarray | None = None
175
168
  if isinstance(item, str):
176
169
  assert self.string_embed is not None
177
170
  embedding = self.string_embed.exec([item], {})
@@ -196,9 +189,7 @@ class EmbeddingIndex(IndexBase):
196
189
  return 'embedding'
197
190
 
198
191
  @classmethod
199
- def _resolve_embedding_fn(
200
- cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type
201
- ) -> Optional[func.Function]:
192
+ def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
202
193
  """Find an overload resolution for `embed_fn` that matches the given type."""
203
194
  assert isinstance(embed_fn, func.Function)
204
195
  for resolved_fn in embed_fn._resolved_fns:
@@ -252,7 +243,7 @@ class EmbeddingIndex(IndexBase):
252
243
  }
253
244
 
254
245
  @classmethod
255
- def from_dict(cls, c: catalog.Column, d: dict) -> EmbeddingIndex:
246
+ def from_dict(cls, d: dict) -> EmbeddingIndex:
256
247
  string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
257
248
  image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
258
- return cls(c, metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
249
+ return cls(metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
pixeltable/io/datarows.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional
3
+ from typing import Any, Iterable
4
4
 
5
5
  import pixeltable as pxt
6
6
  import pixeltable.type_system as ts
@@ -60,7 +60,7 @@ def import_rows(
60
60
  tbl_path: str,
61
61
  rows: list[dict[str, Any]],
62
62
  *,
63
- schema_overrides: Optional[dict[str, Any]] = None,
63
+ schema_overrides: dict[str, Any] | None = None,
64
64
  primary_key: str | list[str] | None = None,
65
65
  num_retained_versions: int = 10,
66
66
  comment: str = '',
@@ -104,7 +104,7 @@ def import_json(
104
104
  tbl_path: str,
105
105
  filepath_or_url: str,
106
106
  *,
107
- schema_overrides: Optional[dict[str, Any]] = None,
107
+ schema_overrides: dict[str, Any] | None = None,
108
108
  primary_key: str | list[str] | None = None,
109
109
  num_retained_versions: int = 10,
110
110
  comment: str = '',
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import abc
4
4
  import itertools
5
5
  import logging
6
- from typing import Any, Optional
6
+ from typing import Any
7
7
 
8
8
  import pixeltable.exceptions as excs
9
9
  import pixeltable.type_system as ts
@@ -68,10 +68,7 @@ class Project(ExternalStore, abc.ABC):
68
68
  stored_proxies: dict[ColumnHandle, ColumnHandle] # original col -> proxy col
69
69
 
70
70
  def __init__(
71
- self,
72
- name: str,
73
- col_mapping: dict[ColumnHandle, str],
74
- stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
71
+ self, name: str, col_mapping: dict[ColumnHandle, str], stored_proxies: dict[ColumnHandle, ColumnHandle] | None
75
72
  ):
76
73
  super().__init__(name)
77
74
  self._col_mapping = col_mapping
@@ -190,7 +187,7 @@ class Project(ExternalStore, abc.ABC):
190
187
  table: Table,
191
188
  export_cols: dict[str, ts.ColumnType],
192
189
  import_cols: dict[str, ts.ColumnType],
193
- col_mapping: Optional[dict[str, str]],
190
+ col_mapping: dict[str, str] | None,
194
191
  ) -> dict[ColumnHandle, str]:
195
192
  """
196
193
  Verifies that the specified `col_mapping` is valid. In particular, checks that:
@@ -217,19 +214,19 @@ class Project(ExternalStore, abc.ABC):
217
214
  if t_col not in t_cols:
218
215
  if is_user_specified_col_mapping:
219
216
  raise excs.Error(
220
- f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table._name}` '
217
+ f'Column name {t_col!r} appears as a key in `col_mapping`, but {table._display_str()} '
221
218
  'contains no such column.'
222
219
  )
223
220
  else:
224
221
  raise excs.Error(
225
- f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
222
+ f'Column {t_col!r} does not exist in {table._display_str()}. Either add a column {t_col!r}, '
226
223
  f'or specify a `col_mapping` to associate a different column with '
227
- f'the external field `{ext_col}`.'
224
+ f'the external field {ext_col!r}.'
228
225
  )
229
226
  if ext_col not in export_cols and ext_col not in import_cols:
230
227
  raise excs.Error(
231
- f'Column name `{ext_col}` appears as a value in `col_mapping`, but the external store '
232
- f'configuration has no column `{ext_col}`.'
228
+ f'Column name {ext_col!r} appears as a value in `col_mapping`, but the external store '
229
+ f'configuration has no column {ext_col!r}.'
233
230
  )
234
231
  col_ref = table[t_col]
235
232
  assert isinstance(col_ref, exprs.ColumnRef)
@@ -244,19 +241,19 @@ class Project(ExternalStore, abc.ABC):
244
241
  ext_col_type = export_cols[ext_col]
245
242
  if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
246
243
  raise excs.Error(
247
- f'Column `{t_col}` cannot be exported to external column `{ext_col}` '
244
+ f'Column {t_col!r} cannot be exported to external column {ext_col!r} '
248
245
  f'(incompatible types; expecting `{ext_col_type}`)'
249
246
  )
250
247
  if ext_col in import_cols:
251
248
  # Validate that the external column can be assigned to the table column
252
249
  if table._tbl_version_path.get_column(t_col).is_computed:
253
250
  raise excs.Error(
254
- f'Column `{t_col}` is a computed column, which cannot be populated from an external column'
251
+ f'Column {t_col!r} is a computed column, which cannot be populated from an external column'
255
252
  )
256
253
  ext_col_type = import_cols[ext_col]
257
254
  if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
258
255
  raise excs.Error(
259
- f'Column `{t_col}` cannot be imported from external column `{ext_col}` '
256
+ f'Column {t_col!r} cannot be imported from external column {ext_col!r} '
260
257
  f'(incompatible types; expecting `{ext_col_type}`)'
261
258
  )
262
259
  return resolved_col_mapping
@@ -271,7 +268,7 @@ class MockProject(Project):
271
268
  export_cols: dict[str, ts.ColumnType],
272
269
  import_cols: dict[str, ts.ColumnType],
273
270
  col_mapping: dict[ColumnHandle, str],
274
- stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
271
+ stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
275
272
  ):
276
273
  super().__init__(name, col_mapping, stored_proxies)
277
274
  self.export_cols = export_cols
@@ -285,7 +282,7 @@ class MockProject(Project):
285
282
  name: str,
286
283
  export_cols: dict[str, ts.ColumnType],
287
284
  import_cols: dict[str, ts.ColumnType],
288
- col_mapping: Optional[dict[str, str]] = None,
285
+ col_mapping: dict[str, str] | None = None,
289
286
  ) -> 'MockProject':
290
287
  col_mapping = cls.validate_columns(t, export_cols, import_cols, col_mapping)
291
288
  return cls(name, export_cols, import_cols, col_mapping)