pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (99) hide show
  1. pixeltable/__init__.py +18 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +31 -50
  4. pixeltable/catalog/insertable_table.py +7 -6
  5. pixeltable/catalog/table.py +171 -57
  6. pixeltable/catalog/table_version.py +417 -140
  7. pixeltable/catalog/table_version_path.py +2 -2
  8. pixeltable/dataframe.py +239 -121
  9. pixeltable/env.py +82 -16
  10. pixeltable/exec/__init__.py +2 -1
  11. pixeltable/exec/cache_prefetch_node.py +1 -1
  12. pixeltable/exec/data_row_batch.py +6 -7
  13. pixeltable/exec/expr_eval_node.py +28 -28
  14. pixeltable/exec/in_memory_data_node.py +11 -7
  15. pixeltable/exec/sql_scan_node.py +7 -6
  16. pixeltable/exprs/__init__.py +4 -3
  17. pixeltable/exprs/column_ref.py +9 -0
  18. pixeltable/exprs/comparison.py +3 -3
  19. pixeltable/exprs/data_row.py +5 -1
  20. pixeltable/exprs/expr.py +15 -7
  21. pixeltable/exprs/function_call.py +17 -15
  22. pixeltable/exprs/image_member_access.py +9 -28
  23. pixeltable/exprs/in_predicate.py +96 -0
  24. pixeltable/exprs/inline_array.py +13 -11
  25. pixeltable/exprs/inline_dict.py +15 -13
  26. pixeltable/exprs/literal.py +16 -4
  27. pixeltable/exprs/row_builder.py +15 -41
  28. pixeltable/exprs/similarity_expr.py +65 -0
  29. pixeltable/ext/__init__.py +5 -0
  30. pixeltable/ext/functions/yolox.py +92 -0
  31. pixeltable/func/__init__.py +0 -2
  32. pixeltable/func/aggregate_function.py +18 -15
  33. pixeltable/func/callable_function.py +57 -13
  34. pixeltable/func/expr_template_function.py +20 -3
  35. pixeltable/func/function.py +35 -4
  36. pixeltable/func/globals.py +24 -14
  37. pixeltable/func/signature.py +23 -27
  38. pixeltable/func/udf.py +13 -12
  39. pixeltable/functions/__init__.py +8 -8
  40. pixeltable/functions/eval.py +7 -8
  41. pixeltable/functions/huggingface.py +64 -17
  42. pixeltable/functions/openai.py +36 -3
  43. pixeltable/functions/pil/image.py +61 -64
  44. pixeltable/functions/together.py +21 -0
  45. pixeltable/functions/util.py +11 -0
  46. pixeltable/globals.py +425 -0
  47. pixeltable/index/__init__.py +2 -0
  48. pixeltable/index/base.py +51 -0
  49. pixeltable/index/embedding_index.py +168 -0
  50. pixeltable/io/__init__.py +3 -0
  51. pixeltable/{utils → io}/hf_datasets.py +48 -17
  52. pixeltable/io/pandas.py +148 -0
  53. pixeltable/{utils → io}/parquet.py +58 -33
  54. pixeltable/iterators/__init__.py +1 -1
  55. pixeltable/iterators/base.py +4 -0
  56. pixeltable/iterators/document.py +218 -97
  57. pixeltable/iterators/video.py +8 -9
  58. pixeltable/metadata/__init__.py +7 -3
  59. pixeltable/metadata/converters/convert_12.py +3 -0
  60. pixeltable/metadata/converters/convert_13.py +41 -0
  61. pixeltable/metadata/schema.py +45 -22
  62. pixeltable/plan.py +15 -51
  63. pixeltable/store.py +38 -41
  64. pixeltable/tool/create_test_db_dump.py +39 -4
  65. pixeltable/type_system.py +47 -96
  66. pixeltable/utils/documents.py +42 -12
  67. pixeltable/utils/http_server.py +70 -0
  68. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
  69. pixeltable-0.2.6.dist-info/RECORD +119 -0
  70. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
  71. pixeltable/client.py +0 -604
  72. pixeltable/exprs/image_similarity_predicate.py +0 -58
  73. pixeltable/func/batched_function.py +0 -53
  74. pixeltable/tests/conftest.py +0 -177
  75. pixeltable/tests/functions/test_fireworks.py +0 -42
  76. pixeltable/tests/functions/test_functions.py +0 -60
  77. pixeltable/tests/functions/test_huggingface.py +0 -158
  78. pixeltable/tests/functions/test_openai.py +0 -152
  79. pixeltable/tests/functions/test_together.py +0 -111
  80. pixeltable/tests/test_audio.py +0 -65
  81. pixeltable/tests/test_catalog.py +0 -27
  82. pixeltable/tests/test_client.py +0 -21
  83. pixeltable/tests/test_component_view.py +0 -370
  84. pixeltable/tests/test_dataframe.py +0 -439
  85. pixeltable/tests/test_dirs.py +0 -107
  86. pixeltable/tests/test_document.py +0 -120
  87. pixeltable/tests/test_exprs.py +0 -805
  88. pixeltable/tests/test_function.py +0 -324
  89. pixeltable/tests/test_migration.py +0 -43
  90. pixeltable/tests/test_nos.py +0 -54
  91. pixeltable/tests/test_snapshot.py +0 -208
  92. pixeltable/tests/test_table.py +0 -1267
  93. pixeltable/tests/test_transactional_directory.py +0 -42
  94. pixeltable/tests/test_types.py +0 -22
  95. pixeltable/tests/test_video.py +0 -159
  96. pixeltable/tests/test_view.py +0 -530
  97. pixeltable/tests/utils.py +0 -408
  98. pixeltable-0.2.4.dist-info/RECORD +0 -132
  99. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
pixeltable/__init__.py CHANGED
@@ -1,18 +1,30 @@
1
1
  from .catalog import Column, Table, InsertableTable, View
2
- from .client import Client
3
2
  from .dataframe import DataFrame
4
3
  from .exceptions import Error, Error
5
4
  from .exprs import RELATIVE_PATH_ROOT
6
5
  from .func import Function, udf, uda, Aggregator, expr_udf
7
- from .type_system import \
8
- ColumnType, StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ArrayType, ImageType, VideoType, \
9
- AudioType, DocumentType
6
+ from .globals import *
7
+ from .type_system import (
8
+ ColumnType,
9
+ StringType,
10
+ IntType,
11
+ FloatType,
12
+ BoolType,
13
+ TimestampType,
14
+ JsonType,
15
+ ArrayType,
16
+ ImageType,
17
+ VideoType,
18
+ AudioType,
19
+ DocumentType,
20
+ )
10
21
  from .utils.help import help
22
+
11
23
  # noinspection PyUnresolvedReferences
12
- from . import functions
24
+ from . import functions, io
25
+ from .__version__ import __version__, __version_tuple__
13
26
 
14
27
  __all__ = [
15
- 'Client',
16
28
  'DataFrame',
17
29
  'Column',
18
30
  'Table',
@@ -39,6 +51,3 @@ __all__ = [
39
51
  'uda',
40
52
  'expr_udf',
41
53
  ]
42
-
43
-
44
-
@@ -0,0 +1,3 @@
1
+ # These version placeholders will be replaced during build.
2
+ __version__ = "0.2.6"
3
+ __version_tuple__ = (0, 2, 6)
@@ -4,11 +4,9 @@ import logging
4
4
  from typing import Optional, Union, Callable, Set
5
5
 
6
6
  import sqlalchemy as sql
7
- from pgvector.sqlalchemy import Vector
8
7
 
9
- from pixeltable import exceptions as excs
10
- from pixeltable.metadata import schema
11
- from pixeltable.type_system import ColumnType, StringType
8
+ import pixeltable.exceptions as excs
9
+ import pixeltable.type_system as ts
12
10
  from .globals import is_valid_identifier
13
11
 
14
12
  _logger = logging.getLogger('pixeltable')
@@ -20,44 +18,38 @@ class Column:
20
18
  table/view.
21
19
  """
22
20
  def __init__(
23
- self, name: str, col_type: Optional[ColumnType] = None,
21
+ self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
24
22
  computed_with: Optional[Union['Expr', Callable]] = None,
25
- primary_key: bool = False, stored: Optional[bool] = None,
26
- indexed: bool = False,
27
- # these parameters aren't set by users
28
- col_id: Optional[int] = None):
23
+ is_pk: bool = False, stored: Optional[bool] = None,
24
+ col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
25
+ schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
26
+ ):
29
27
  """Column constructor.
30
28
 
31
29
  Args:
32
- name: column name
30
+ name: column name; None for system columns (e.g., index columns)
33
31
  col_type: column type; can be None if the type can be derived from ``computed_with``
34
32
  computed_with: a callable or an Expr object that computes the column value
35
- primary_key: if True, this column is part of the primary key
33
+ is_pk: if True, this column is part of the primary key
36
34
  stored: determines whether a computed column is present in the stored table or recomputed on demand
37
- indexed: if True, this column has a nearest neighbor index (only valid for image columns)
38
35
  col_id: column ID (only used internally)
39
36
 
40
37
  Computed columns: those have a non-None ``computed_with`` argument
41
-
42
38
  - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
43
39
  col_type is None
44
40
  - when loaded from md store: ``computed_with`` is set and col_type is set
45
41
 
46
42
  ``computed_with`` is a Callable:
47
-
48
43
  - the callable's parameter names must correspond to existing columns in the table for which this Column
49
44
  is being used
50
45
  - ``col_type`` needs to be set to the callable's return type
51
46
 
52
47
  ``stored`` (only valid for computed image columns):
53
-
54
48
  - if True: the column is present in the stored table
55
49
  - if False: the column is not present in the stored table and recomputed during a query
56
50
  - if None: the system chooses for you (at present, this is always False, but this may change in the future)
57
-
58
- indexed: only valid for image columns; if true, maintains an NN index for this column
59
51
  """
60
- if not is_valid_identifier(name):
52
+ if name is not None and not is_valid_identifier(name):
61
53
  raise excs.Error(f"Invalid column name: '{name}'")
62
54
  self.name = name
63
55
  if col_type is None and computed_with is None:
@@ -90,35 +82,20 @@ class Column:
90
82
  self.stored = stored
91
83
  self.dependent_cols: Set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
92
84
  self.id = col_id
93
- self.primary_key = primary_key
85
+ self.is_pk = is_pk
86
+ self.schema_version_add = schema_version_add
87
+ self.schema_version_drop = schema_version_drop
94
88
 
95
89
  # column in the stored table for the values of this Column
96
90
  self.sa_col: Optional[sql.schema.Column] = None
91
+ self.sa_col_type = sa_col_type
97
92
 
98
93
  # computed cols also have storage columns for the exception string and type
99
94
  self.sa_errormsg_col: Optional[sql.schema.Column] = None
100
95
  self.sa_errortype_col: Optional[sql.schema.Column] = None
101
- # indexed columns also have a column for the embeddings
102
- self.sa_idx_col: Optional[sql.schema.Column] = None
103
96
  from .table_version import TableVersion
104
97
  self.tbl: Optional[TableVersion] = None # set by owning TableVersion
105
98
 
106
- if indexed and not self.col_type.is_image_type():
107
- raise excs.Error(f'Column {name}: indexed=True requires ImageType')
108
- self.is_indexed = indexed
109
-
110
- @classmethod
111
- def from_md(cls, col_id: int, md: schema.SchemaColumn, tbl: 'TableVersion') -> Column:
112
- """Construct a Column from metadata.
113
-
114
- Leaves out value_expr, because that requires TableVersion.cols to be complete.
115
- """
116
- col = cls(
117
- md.name, col_type=ColumnType.from_dict(md.col_type), primary_key=md.is_pk,
118
- stored=md.stored, indexed=md.is_indexed, col_id=col_id)
119
- col.tbl = tbl
120
- return col
121
-
122
99
  def __hash__(self) -> int:
123
100
  assert self.tbl is not None
124
101
  return hash((self.tbl.id, self.id))
@@ -137,6 +114,10 @@ class Column:
137
114
  l = list(self.value_expr.subexprs(filter=lambda e: isinstance(e, exprs.FunctionCall) and e.is_window_fn_call))
138
115
  return len(l) > 0
139
116
 
117
+ def get_idx_info(self) -> dict[str, 'pixeltable.catalog.TableVersion.IndexInfo']:
118
+ assert self.tbl is not None
119
+ return {name: info for name, info in self.tbl.idxs_by_name.items() if info.col == self}
120
+
140
121
  @property
141
122
  def is_computed(self) -> bool:
142
123
  return self.compute_func is not None or self.value_expr is not None
@@ -167,26 +148,26 @@ class Column:
167
148
  """
168
149
  assert self.is_stored
169
150
  # all storage columns are nullable (we deal with null errors in Pixeltable directly)
170
- self.sa_col = sql.Column(self.storage_name(), self.col_type.to_sa_type(), nullable=True)
151
+ self.sa_col = sql.Column(
152
+ self.store_name(), self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type,
153
+ nullable=True)
171
154
  if self.is_computed or self.col_type.is_media_type():
172
- self.sa_errormsg_col = sql.Column(self.errormsg_storage_name(), StringType().to_sa_type(), nullable=True)
173
- self.sa_errortype_col = sql.Column(self.errortype_storage_name(), StringType().to_sa_type(), nullable=True)
174
- if self.is_indexed:
175
- self.sa_idx_col = sql.Column(self.index_storage_name(), Vector(512), nullable=True)
155
+ self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), ts.StringType().to_sa_type(), nullable=True)
156
+ self.sa_errortype_col = sql.Column(self.errortype_store_name(), ts.StringType().to_sa_type(), nullable=True)
157
+
158
+ def get_sa_col_type(self) -> sql.sqltypes.TypeEngine:
159
+ return self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type
176
160
 
177
- def storage_name(self) -> str:
161
+ def store_name(self) -> str:
178
162
  assert self.id is not None
179
163
  assert self.is_stored
180
164
  return f'col_{self.id}'
181
165
 
182
- def errormsg_storage_name(self) -> str:
183
- return f'{self.storage_name()}_errormsg'
184
-
185
- def errortype_storage_name(self) -> str:
186
- return f'{self.storage_name()}_errortype'
166
+ def errormsg_store_name(self) -> str:
167
+ return f'{self.store_name()}_errormsg'
187
168
 
188
- def index_storage_name(self) -> str:
189
- return f'{self.storage_name()}_idx_0'
169
+ def errortype_store_name(self) -> str:
170
+ return f'{self.store_name()}_errortype'
190
171
 
191
172
  def __str__(self) -> str:
192
173
  return f'{self.name}: {self.col_type}'
@@ -11,14 +11,17 @@ import pixeltable.type_system as ts
11
11
  from pixeltable import exceptions as excs
12
12
  from pixeltable.env import Env
13
13
  from .catalog import Catalog
14
+ from .globals import UpdateStatus
14
15
  from .table import Table
15
16
  from .table_version import TableVersion
16
17
  from .table_version_path import TableVersionPath
17
18
 
18
19
  _logger = logging.getLogger('pixeltable')
19
20
 
21
+
20
22
  class InsertableTable(Table):
21
23
  """A `Table` that allows inserting and deleting rows."""
24
+
22
25
  def __init__(self, dir_id: UUID, tbl_version: TableVersion):
23
26
  tbl_version_path = TableVersionPath(tbl_version)
24
27
  super().__init__(tbl_version.id, dir_id, tbl_version.name, tbl_version_path)
@@ -42,7 +45,7 @@ class InsertableTable(Table):
42
45
  col = columns[column_names.index(pk_col)]
43
46
  if col.col_type.nullable:
44
47
  raise excs.Error(f'Primary key column {pk_col} cannot be nullable')
45
- col.primary_key = True
48
+ col.is_pk = True
46
49
 
47
50
  with orm.Session(Env.get().engine, future=True) as session:
48
51
  _, tbl_version = TableVersion.create(session, dir_id, name, columns, num_retained_versions, comment)
@@ -62,7 +65,7 @@ class InsertableTable(Table):
62
65
  @overload
63
66
  def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...
64
67
 
65
- def insert(self, *args, **kwargs) -> Table.UpdateStatus:
68
+ def insert(self, *args, **kwargs) -> UpdateStatus:
66
69
  """Insert rows into table.
67
70
 
68
71
  To insert multiple rows at a time:
@@ -161,7 +164,7 @@ class InsertableTable(Table):
161
164
  msg = str(e)
162
165
  raise excs.Error(f'Error in column {col.name}: {msg[0].lower() + msg[1:]}\nRow: {row}')
163
166
 
164
- def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> Table.UpdateStatus:
167
+ def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> UpdateStatus:
165
168
  """Delete rows in this table.
166
169
 
167
170
  Args:
@@ -181,9 +184,7 @@ class InsertableTable(Table):
181
184
  if where is not None:
182
185
  if not isinstance(where, Predicate):
183
186
  raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
184
- analysis_info = Planner.analyze(self.tbl_version, where)
185
- if analysis_info.similarity_clause is not None:
186
- raise excs.Error('nearest() cannot be used with delete()')
187
+ analysis_info = Planner.analyze(self.tbl_version_path, where)
187
188
  # for now we require that the updated rows can be identified via SQL, rather than via a Python filter
188
189
  if analysis_info.filter is not None:
189
190
  raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
@@ -1,10 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- import dataclasses
4
3
  import json
5
4
  import logging
6
5
  from pathlib import Path
7
- from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple
6
+ from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
8
7
  from uuid import UUID
9
8
 
10
9
  import pandas as pd
@@ -18,7 +17,7 @@ import pixeltable.exprs as exprs
18
17
  import pixeltable.metadata.schema as schema
19
18
  import pixeltable.type_system as ts
20
19
  from .column import Column
21
- from .globals import is_valid_identifier, is_system_column_name
20
+ from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
22
21
  from .schema_object import SchemaObject
23
22
  from .table_version import TableVersion
24
23
  from .table_version_path import TableVersionPath
@@ -28,14 +27,7 @@ _logger = logging.getLogger('pixeltable')
28
27
  class Table(SchemaObject):
29
28
  """Base class for all tabular SchemaObjects."""
30
29
 
31
- @dataclasses.dataclass
32
- class UpdateStatus:
33
- num_rows: int = 0
34
- # TODO: change to num_computed_columns (the number of computed slots isn't really meaningful to the user)
35
- num_computed_values: int = 0
36
- num_excs: int = 0
37
- updated_cols: List[str] = dataclasses.field(default_factory=list)
38
- cols_with_excs: List[str] = dataclasses.field(default_factory=list)
30
+ ROWID_COLUMN_NAME = '_rowid'
39
31
 
40
32
  def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
41
33
  super().__init__(id, name, dir_id)
@@ -105,6 +97,11 @@ class Table(SchemaObject):
105
97
  from pixeltable.dataframe import DataFrame
106
98
  return DataFrame(self.tbl_version_path).order_by(*items, asc=asc)
107
99
 
100
+ def group_by(self, *items: 'exprs.Expr') -> 'pixeltable.dataframe.DataFrame':
101
+ """Return a DataFrame for this table, grouped by the given expressions."""
102
+ from pixeltable.dataframe import DataFrame
103
+ return DataFrame(self.tbl_version_path).group_by(*items)
104
+
108
105
  def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
109
106
  """Return rows from this table.
110
107
  """
@@ -225,7 +222,7 @@ class Table(SchemaObject):
225
222
  value: column type or value expression or column specification dictionary:
226
223
  column type: a Pixeltable column type (if the table already contains rows, it must be nullable)
227
224
  value expression: a Pixeltable expression that computes the column values
228
- column specification: a dictionary with possible keys 'type', 'value', 'stored', 'indexed'
225
+ column specification: a dictionary with possible keys 'type', 'value', 'stored'
229
226
  Examples:
230
227
  Add an int column with ``None`` values:
231
228
 
@@ -247,11 +244,6 @@ class Table(SchemaObject):
247
244
  Do the same, but now the column is stored:
248
245
 
249
246
  >>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
250
-
251
- Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
252
- to be indexed:
253
-
254
- >>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
255
247
  """
256
248
  if not isinstance(column_name, str):
257
249
  raise excs.Error(f'Column name must be a string, got {type(column_name)}')
@@ -264,8 +256,8 @@ class Table(SchemaObject):
264
256
 
265
257
  def add_column(
266
258
  self, *,
267
- type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None, indexed: Optional[bool] = None,
268
- print_stats: bool = False, **kwargs: Any
259
+ type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None, print_stats: bool = False,
260
+ **kwargs: Any
269
261
  ) -> UpdateStatus:
270
262
  """Adds a column to the table.
271
263
 
@@ -273,7 +265,6 @@ class Table(SchemaObject):
273
265
  kwargs: Exactly one keyword argument of the form ``column-name=type|value-expression``.
274
266
  type: The type of the column. Only valid and required if ``value-expression`` is a Callable.
275
267
  stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
276
- indexed: Whether the column is indexed.
277
268
  print_stats: If ``True``, print execution metrics.
278
269
 
279
270
  Returns:
@@ -318,15 +309,6 @@ class Table(SchemaObject):
318
309
  Alternatively, this can also be expressed as:
319
310
 
320
311
  >>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
321
-
322
- Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
323
- to be indexed:
324
-
325
- >>> tbl.add_column(small_frame=tbl.frame.resize([224, 224]), indexed=True)
326
-
327
- Alternatively, this can also be expressed as:
328
-
329
- >>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
330
312
  """
331
313
  self._check_is_dropped()
332
314
  # verify kwargs and construct column schema dict
@@ -349,8 +331,6 @@ class Table(SchemaObject):
349
331
  col_schema['type'] = type
350
332
  if stored is not None:
351
333
  col_schema['stored'] = stored
352
- if indexed is not None:
353
- col_schema['indexed'] = indexed
354
334
 
355
335
  new_col = self._create_columns({col_name: col_schema})[0]
356
336
  self._verify_column(new_col, self.column_names())
@@ -364,7 +344,7 @@ class Table(SchemaObject):
364
344
  (on account of containing Python Callables or Exprs).
365
345
  """
366
346
  assert isinstance(spec, dict)
367
- valid_keys = {'type', 'value', 'stored', 'indexed'}
347
+ valid_keys = {'type', 'value', 'stored'}
368
348
  has_type = False
369
349
  for k in spec.keys():
370
350
  if k not in valid_keys:
@@ -393,8 +373,6 @@ class Table(SchemaObject):
393
373
 
394
374
  if 'stored' in spec and not isinstance(spec['stored'], bool):
395
375
  raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
396
- if 'indexed' in spec and not isinstance(spec['indexed'], bool):
397
- raise excs.Error(f'Column {name}: "indexed" must be a bool, got {spec["indexed"]}')
398
376
  if not has_type:
399
377
  raise excs.Error(f'Column {name}: "type" is required')
400
378
 
@@ -406,7 +384,6 @@ class Table(SchemaObject):
406
384
  col_type: Optional[ts.ColumnType] = None
407
385
  value_expr: Optional[exprs.Expr] = None
408
386
  stored: Optional[bool] = None
409
- indexed: Optional[bool] = None
410
387
  primary_key: Optional[bool] = None
411
388
 
412
389
  if isinstance(spec, ts.ColumnType):
@@ -428,12 +405,10 @@ class Table(SchemaObject):
428
405
  # create copy so we can modify it
429
406
  value_expr = value_expr.copy()
430
407
  stored = spec.get('stored')
431
- indexed = spec.get('indexed')
432
408
  primary_key = spec.get('primary_key')
433
409
 
434
410
  column = Column(
435
- name, col_type=col_type, computed_with=value_expr, stored=stored, indexed=indexed,
436
- primary_key=primary_key)
411
+ name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key)
437
412
  columns.append(column)
438
413
  return columns
439
414
 
@@ -498,9 +473,90 @@ class Table(SchemaObject):
498
473
  self._check_is_dropped()
499
474
  self.tbl_version_path.tbl_version.rename_column(old_name, new_name)
500
475
 
476
+ def add_embedding_index(
477
+ self, col_name: str, *, idx_name: Optional[str] = None,
478
+ text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None,
479
+ metric: str = 'cosine'
480
+ ) -> None:
481
+ """Add an embedding index on a column of the table.
482
+ Args:
483
+ col_name: name of column to index
484
+ idx_name: name of index, which needs to be unique for the table; if not provided, a name will be generated
485
+ text_embed: function to embed text; required if the column is a text column
486
+ img_embed: function to embed images; required if the column is an image column
487
+ metric: distance metric to use for the index; one of 'cosine', 'ip', 'l2'; default is 'cosine'
488
+
489
+ Raises:
490
+ Error: If an index with that name already exists for the table or if the column does not exist.
491
+
492
+ Examples:
493
+ Add an index to the ``img`` column:
494
+
495
+ >>> tbl.add_embedding_index('img', img_embed=...)
496
+
497
+ Add another index to the ``img`` column, using the inner product as the distance metric,
498
+ and with a specific name; ``text_embed`` is also specified in order to search with text:
499
+
500
+ >>> tbl.add_embedding_index(
501
+ 'img', idx_name='clip_idx', img_embed=..., text_embed=..., metric='ip')
502
+ """
503
+ if self.tbl_version_path.is_snapshot():
504
+ raise excs.Error('Cannot add an index to a snapshot')
505
+ self._check_is_dropped()
506
+ col = self.tbl_version_path.get_column(col_name, include_bases=True)
507
+ if col is None:
508
+ raise excs.Error(f'Column {col_name} unknown')
509
+ if idx_name is not None and idx_name in self.tbl_version_path.tbl_version.idxs_by_name:
510
+ raise excs.Error(f'Duplicate index name: {idx_name}')
511
+ from pixeltable.index import EmbeddingIndex
512
+ # create the EmbeddingIndex instance to verify args
513
+ idx = EmbeddingIndex(col, metric=metric, text_embed=text_embed, img_embed=img_embed)
514
+ status = self.tbl_version_path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
515
+ # TODO: how to deal with exceptions here? drop the index and raise?
516
+
517
+ def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
518
+ """Drop an index from the table.
519
+
520
+ Args:
521
+ column_name: The name of the column whose index to drop. Invalid if the column has multiple indices.
522
+ idx_name: The name of the index to drop.
523
+
524
+ Raises:
525
+ Error: If the index does not exist.
526
+
527
+ Examples:
528
+ Drop index on the ``img`` column:
529
+
530
+ >>> tbl.drop_index(column_name='img')
531
+ """
532
+ if self.tbl_version_path.is_snapshot():
533
+ raise excs.Error('Cannot drop an index from a snapshot')
534
+ self._check_is_dropped()
535
+ if (column_name is None) == (idx_name is None):
536
+ raise excs.Error('Exactly one of column_name or idx_name must be provided')
537
+ tbl_version = self.tbl_version_path.tbl_version
538
+
539
+ if idx_name is not None:
540
+ if idx_name not in tbl_version.idxs_by_name:
541
+ raise excs.Error(f'Index {idx_name} does not exist')
542
+ idx_id = tbl_version.idxs_by_name[idx_name].id
543
+ else:
544
+ col = self.tbl_version_path.get_column(column_name, include_bases=True)
545
+ if col is None:
546
+ raise excs.Error(f'Column {column_name} unknown')
547
+ if col.tbl.id != tbl_version.id:
548
+ raise excs.Error(
549
+ f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
550
+ idx_ids = [info.id for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
551
+ if len(idx_ids) == 0:
552
+ raise excs.Error(f'Column {column_name} does not have an index')
553
+ if len(idx_ids) > 1:
554
+ raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
555
+ idx_id = idx_ids[0]
556
+ self.tbl_version_path.tbl_version.drop_index(idx_id)
557
+
501
558
  def update(
502
- self, value_spec: Dict[str, Union['pixeltable.exprs.Expr', Any]],
503
- where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
559
+ self, value_spec: dict[str, Any], where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
504
560
  ) -> UpdateStatus:
505
561
  """Update rows in this table.
506
562
 
@@ -510,11 +566,11 @@ class Table(SchemaObject):
510
566
  cascade: if True, also update all computed columns that transitively depend on the updated columns.
511
567
 
512
568
  Examples:
513
- Set newly-added column `int_col` to 1 for all rows:
569
+ Set column `int_col` to 1 for all rows:
514
570
 
515
571
  >>> tbl.update({'int_col': 1})
516
572
 
517
- Set newly-added column `int_col` to 1 for all rows where `int_col` is 0:
573
+ Set column `int_col` to 1 for all rows where `int_col` is 0:
518
574
 
519
575
  >>> tbl.update({'int_col': 1}, where=tbl.int_col == 0)
520
576
 
@@ -526,27 +582,93 @@ class Table(SchemaObject):
526
582
 
527
583
  >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
528
584
  """
585
+ if self.tbl_version_path.is_snapshot():
586
+ raise excs.Error('Cannot update a snapshot')
587
+ self._check_is_dropped()
588
+
589
+ update_spec = self._validate_update_spec(value_spec, allow_pk=False, allow_exprs=True)
590
+ from pixeltable.plan import Planner
591
+ if where is not None:
592
+ if not isinstance(where, exprs.Predicate):
593
+ raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
594
+ analysis_info = Planner.analyze(self.tbl_version_path, where)
595
+ # for now we require that the updated rows can be identified via SQL, rather than via a Python filter
596
+ if analysis_info.filter is not None:
597
+ raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
598
+
599
+ return self.tbl_version_path.tbl_version.update(update_spec, where, cascade)
600
+
601
+ def batch_update(self, rows: Iterable[dict[str, Any]], cascade: bool = True) -> UpdateStatus:
602
+ """Update individual rows in this table, each identified by its primary key (or by the `_rowid` pseudo-column).
603
+
604
+ Args:
605
+ rows: an Iterable of dictionaries containing values for the updated columns plus values for the primary key
606
+ columns.
607
+ cascade: if True, also update all computed columns that transitively depend on the updated columns.
608
+
609
+ Examples:
610
+ Update the 'name' and 'age' columns for the rows with ids 1 and 2 (assuming 'id' is the primary key):
611
+
612
+ >>> tbl.batch_update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
613
+ """
614
+ if self.tbl_version_path.is_snapshot():
615
+ raise excs.Error('Cannot update a snapshot')
616
+ self._check_is_dropped()
617
+
618
+ row_updates: List[Dict[Column, exprs.Expr]] = []
619
+ pk_col_names = set(c.name for c in self.tbl_version_path.tbl_version.primary_key_columns())
620
+
621
+ # pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
622
+ has_rowid = self.ROWID_COLUMN_NAME in rows[0]
623
+ rowids: list[Tuple[int, ...]] = []
624
+ if len(pk_col_names) == 0 and not has_rowid:
625
+ raise excs.Error('Table must have primary key for batch update')
626
+
627
+ for row_spec in rows:
628
+ col_vals = self._validate_update_spec(row_spec, allow_pk=not has_rowid, allow_exprs=False)
629
+ if has_rowid:
630
+ # we expect the _rowid column to be present for each row
631
+ assert self.ROWID_COLUMN_NAME in row_spec
632
+ rowids.append(row_spec[self.ROWID_COLUMN_NAME])
633
+ else:
634
+ col_names = set(col.name for col in col_vals.keys())
635
+ if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
636
+ missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
637
+ raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
638
+ row_updates.append(col_vals)
639
+ return self.tbl_version_path.tbl_version.batch_update(row_updates, rowids, cascade)
640
+
641
+ def _validate_update_spec(
642
+ self, value_spec: dict[str, Any], allow_pk: bool, allow_exprs: bool
643
+ ) -> dict[Column, 'pixeltable.exprs.Expr']:
529
644
  from pixeltable import exprs
530
- update_targets: List[Tuple[Column, exprs.Expr]] = []
645
+ update_targets: dict[Column, exprs.Expr] = {}
531
646
  for col_name, val in value_spec.items():
532
647
  if not isinstance(col_name, str):
533
648
  raise excs.Error(f'Update specification: dict key must be column name, got {col_name!r}')
649
+ if col_name == self.ROWID_COLUMN_NAME:
650
+ # ignore pseudo-column _rowid
651
+ continue
534
652
  col = self.tbl_version_path.get_column(col_name, include_bases=False)
535
653
  if col is None:
536
654
  # TODO: return more informative error if this is trying to update a base column
537
655
  raise excs.Error(f'Column {col_name} unknown')
538
656
  if col.is_computed:
539
657
  raise excs.Error(f'Column {col_name} is computed and cannot be updated')
540
- if col.primary_key:
658
+ if col.is_pk and not allow_pk:
541
659
  raise excs.Error(f'Column {col_name} is a primary key column and cannot be updated')
542
660
  if col.col_type.is_media_type():
543
661
  raise excs.Error(f'Column {col_name} has type image/video/audio/document and cannot be updated')
544
662
 
545
663
  # make sure that the value is compatible with the column type
546
- # check if this is a literal
547
664
  try:
665
+ # check if this is a literal
548
666
  value_expr = exprs.Literal(val, col_type=col.col_type)
549
667
  except TypeError:
668
+ if not allow_exprs:
669
+ raise excs.Error(
670
+ f'Column {col_name}: value {val!r} is not a valid literal for this column '
671
+ f'(expected {col.col_type})')
550
672
  # it's not a literal, let's try to create an expr from it
551
673
  value_expr = exprs.Expr.from_object(val)
552
674
  if value_expr is None:
@@ -556,20 +678,10 @@ class Table(SchemaObject):
556
678
  f'Type of value {val!r} ({value_expr.col_type}) is not compatible with the type of column '
557
679
  f'{col_name} ({col.col_type})'
558
680
  ))
559
- update_targets.append((col, value_expr))
681
+ update_targets[col] = value_expr
560
682
 
561
- from pixeltable.plan import Planner
562
- if where is not None:
563
- if not isinstance(where, exprs.Predicate):
564
- raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
565
- analysis_info = Planner.analyze(self.tbl_version_path, where)
566
- if analysis_info.similarity_clause is not None:
567
- raise excs.Error('nearest() cannot be used with update()')
568
- # for now we require that the updated rows can be identified via SQL, rather than via a Python filter
569
- if analysis_info.filter is not None:
570
- raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
683
+ return update_targets
571
684
 
572
- return self.tbl_version_path.tbl_version.update(update_targets, where, cascade)
573
685
 
574
686
  def revert(self) -> None:
575
687
  """Reverts the table to the previous version.
@@ -577,5 +689,7 @@ class Table(SchemaObject):
577
689
  .. warning::
578
690
  This operation is irreversible.
579
691
  """
692
+ if self.tbl_version_path.is_snapshot():
693
+ raise excs.Error('Cannot revert a snapshot')
580
694
  self._check_is_dropped()
581
695
  self.tbl_version_path.tbl_version.revert()