pixeltable 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +53 -0
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +181 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +192 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +695 -0
- pixeltable/catalog/table_version.py +1026 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/dataframe.py +749 -0
- pixeltable/env.py +466 -0
- pixeltable/exceptions.py +17 -0
- pixeltable/exec/__init__.py +10 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +94 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +73 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +226 -0
- pixeltable/exprs/__init__.py +25 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +114 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +199 -0
- pixeltable/exprs/expr.py +594 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +382 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +96 -0
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +109 -0
- pixeltable/exprs/inline_dict.py +103 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +66 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +329 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +7 -0
- pixeltable/func/aggregate_function.py +197 -0
- pixeltable/func/callable_function.py +113 -0
- pixeltable/func/expr_template_function.py +99 -0
- pixeltable/func/function.py +141 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +46 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +162 -0
- pixeltable/func/udf.py +164 -0
- pixeltable/functions/__init__.py +95 -0
- pixeltable/functions/eval.py +215 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +167 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +289 -0
- pixeltable/functions/pil/image.py +147 -0
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +143 -0
- pixeltable/functions/util.py +52 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/io/hf_datasets.py +188 -0
- pixeltable/io/pandas.py +148 -0
- pixeltable/io/parquet.py +192 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +52 -0
- pixeltable/iterators/document.py +432 -0
- pixeltable/iterators/video.py +88 -0
- pixeltable/metadata/__init__.py +58 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +234 -0
- pixeltable/plan.py +620 -0
- pixeltable/store.py +424 -0
- pixeltable/tool/create_test_db_dump.py +184 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +846 -0
- pixeltable/utils/__init__.py +17 -0
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +18 -0
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +69 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/http_server.py +70 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.0.0.dist-info/LICENSE +18 -0
- pixeltable-0.0.0.dist-info/METADATA +131 -0
- pixeltable-0.0.0.dist-info/RECORD +119 -0
- pixeltable-0.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,695 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
|
|
7
|
+
from uuid import UUID
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import sqlalchemy as sql
|
|
11
|
+
|
|
12
|
+
import pixeltable
|
|
13
|
+
import pixeltable.catalog as catalog
|
|
14
|
+
import pixeltable.env as env
|
|
15
|
+
import pixeltable.exceptions as excs
|
|
16
|
+
import pixeltable.exprs as exprs
|
|
17
|
+
import pixeltable.metadata.schema as schema
|
|
18
|
+
import pixeltable.type_system as ts
|
|
19
|
+
from .column import Column
|
|
20
|
+
from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
|
|
21
|
+
from .schema_object import SchemaObject
|
|
22
|
+
from .table_version import TableVersion
|
|
23
|
+
from .table_version_path import TableVersionPath
|
|
24
|
+
|
|
25
|
+
_logger = logging.getLogger('pixeltable')
|
|
26
|
+
|
|
27
|
+
class Table(SchemaObject):
    """Base class for all tabular SchemaObjects."""

    # NOTE(review): presumably the name under which the row id is exposed as a column; it is not referenced in
    # the part of the class visible here -- confirm against its users
    ROWID_COLUMN_NAME = '_rowid'
|
|
31
|
+
|
|
32
|
+
def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
    """Initialize the table handle.

    Args:
        id: id of this table; forwarded to the base class constructor.
        dir_id: id of the containing directory; forwarded to the base class constructor.
        name: name of this table; forwarded to the base class constructor.
        tbl_version_path: the TableVersionPath that the methods of this class delegate to.
    """
    # note the argument order of the base class constructor: (id, name, dir_id)
    super().__init__(id, name, dir_id)
    self.tbl_version_path = tbl_version_path
    # set to True by _drop(); checked by _check_is_dropped()
    self.is_dropped = False
|
|
36
|
+
|
|
37
|
+
def move(self, new_name: str, new_dir_id: UUID) -> None:
    """Move this table to a new name/directory and persist the change.

    Calls the base class ``move()`` and then updates the directory id and the ``'name'`` entry of the
    metadata column of this table's record in a single transaction.

    Args:
        new_name: the new name of the table.
        new_dir_id: id of the directory the table is moved to.
    """
    super().move(new_name, new_dir_id)
    # only identifiers taken from the schema definition are interpolated; all values are bound parameters
    update_sql = (
        f"UPDATE {schema.Table.__table__} "
        f"SET {schema.Table.dir_id.name} = :new_dir_id, "
        f" {schema.Table.md.name}['name'] = :new_name "
        f"WHERE {schema.Table.id.name} = :id"
    )
    params = {
        'new_dir_id': new_dir_id,
        # the name is assigned into the metadata column, hence the JSON encoding
        'new_name': json.dumps(new_name),
        'id': self._id,
    }
    with env.Env.get().engine.begin() as conn:
        conn.execute(sql.text(update_sql), params)
|
|
46
|
+
|
|
47
|
+
def version(self) -> int:
    """Return the version of this table's TableVersion. Used by tests to ascertain version changes."""
    tbl_version = self.tbl_version_path.tbl_version
    return tbl_version.version
|
|
50
|
+
|
|
51
|
+
def _tbl_version(self) -> TableVersion:
|
|
52
|
+
"""Return TableVersion for just this table."""
|
|
53
|
+
return self.tbl_version_path.tbl_version
|
|
54
|
+
|
|
55
|
+
def __hash__(self) -> int:
|
|
56
|
+
return hash(self._tbl_version().id)
|
|
57
|
+
|
|
58
|
+
def _check_is_dropped(self) -> None:
|
|
59
|
+
if self.is_dropped:
|
|
60
|
+
raise excs.Error(f'{self.display_name()} {self.name} has been dropped')
|
|
61
|
+
|
|
62
|
+
def __getattr__(self, col_name: str) -> 'pixeltable.exprs.ColumnRef':
|
|
63
|
+
"""Return a ColumnRef for the given column name.
|
|
64
|
+
"""
|
|
65
|
+
return getattr(self.tbl_version_path, col_name)
|
|
66
|
+
|
|
67
|
+
def __getitem__(self, index: object) -> Union['pixeltable.exprs.ColumnRef', 'pixeltable.dataframe.DataFrame']:
|
|
68
|
+
"""Return a ColumnRef for the given column name, or a DataFrame for the given slice.
|
|
69
|
+
"""
|
|
70
|
+
return self.tbl_version_path.__getitem__(index)
|
|
71
|
+
|
|
72
|
+
def df(self) -> 'pixeltable.dataframe.DataFrame':
    """Return a DataFrame constructed over this table's ``tbl_version_path``."""
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame
|
|
78
|
+
|
|
79
|
+
def select(self, *items: Any, **named_items: Any) -> 'pixeltable.dataframe.DataFrame':
    """Return a DataFrame over this table with ``select()`` applied.

    Args:
        items: positional select-list items, forwarded to ``DataFrame.select()``.
        named_items: named select-list items, forwarded to ``DataFrame.select()``.
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.select(*items, **named_items)
|
|
85
|
+
|
|
86
|
+
def where(self, pred: 'exprs.Predicate') -> 'pixeltable.dataframe.DataFrame':
    """Return a DataFrame over this table with ``where(pred)`` applied.

    Args:
        pred: the predicate, forwarded to ``DataFrame.where()``.
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.where(pred)
|
|
92
|
+
|
|
93
|
+
def order_by(self, *items: 'exprs.Expr', asc: bool = True) -> 'pixeltable.dataframe.DataFrame':
    """Return a DataFrame over this table with ``order_by()`` applied.

    Args:
        items: the ordering expressions, forwarded to ``DataFrame.order_by()``.
        asc: forwarded to ``DataFrame.order_by()`` as its ``asc`` argument.
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.order_by(*items, asc=asc)
|
|
99
|
+
|
|
100
|
+
def group_by(self, *items: 'exprs.Expr') -> 'pixeltable.dataframe.DataFrame':
    """Return a DataFrame over this table with ``group_by()`` applied.

    Args:
        items: the grouping expressions, forwarded to ``DataFrame.group_by()``.
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.group_by(*items)
|
|
104
|
+
|
|
105
|
+
def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet':  # type: ignore[name-defined, no-untyped-def]
    """Return rows from this table (the result of ``collect()`` on the DataFrame returned by ``df()``)."""
    frame = self.df()
    return frame.collect()
|
|
109
|
+
|
|
110
|
+
def show(
        self, *args, **kwargs
) -> 'pixeltable.dataframe.DataFrameResultSet':  # type: ignore[name-defined, no-untyped-def]
    """Return rows from this table.

    All arguments are forwarded to ``show()`` of the DataFrame returned by ``df()``.
    """
    frame = self.df()
    return frame.show(*args, **kwargs)
|
|
116
|
+
|
|
117
|
+
def head(
        self, *args, **kwargs
) -> 'pixeltable.dataframe.DataFrameResultSet':  # type: ignore[name-defined, no-untyped-def]
    """Return the first n rows inserted into this table.

    All arguments are forwarded to ``head()`` of the DataFrame returned by ``df()``.
    """
    frame = self.df()
    return frame.head(*args, **kwargs)
|
|
122
|
+
|
|
123
|
+
def tail(
        self, *args, **kwargs
) -> 'pixeltable.dataframe.DataFrameResultSet':  # type: ignore[name-defined, no-untyped-def]
    """Return the last n rows inserted into this table.

    All arguments are forwarded to ``tail()`` of the DataFrame returned by ``df()``.
    """
    frame = self.df()
    return frame.tail(*args, **kwargs)
|
|
128
|
+
|
|
129
|
+
def count(self) -> int:
    """Return the number of rows in this table (``count()`` of the DataFrame returned by ``df()``)."""
    frame = self.df()
    return frame.count()
|
|
132
|
+
|
|
133
|
+
def column_names(self) -> List[str]:
    """Return the names of the columns in this table, in the order reported by ``tbl_version_path.columns()``."""
    cols = self.tbl_version_path.columns()
    return [col.name for col in cols]
|
|
136
|
+
|
|
137
|
+
def column_types(self) -> Dict[str, ts.ColumnType]:
    """Return a dictionary mapping the name of each column in this table to its column type."""
    cols = self.tbl_version_path.columns()
    return {col.name: col.col_type for col in cols}
|
|
140
|
+
|
|
141
|
+
@property
def comment(self) -> Optional[str]:
    """The comment of this table's TableVersion; may be None (``__repr__()`` handles that case)."""
    # Go through tbl_version_path explicitly, like the other methods of this class: this class never sets a
    # 'tbl_version' attribute, so 'self.tbl_version' only resolves via the __getattr__() fallback.
    return self.tbl_version_path.tbl_version.comment

@comment.setter
def comment(self, new_comment: Optional[str]) -> None:
    """Set the comment by calling ``set_comment()`` on this table's TableVersion."""
    self.tbl_version_path.tbl_version.set_comment(new_comment)
|
|
148
|
+
|
|
149
|
+
@property
def num_retained_versions(self):
    """The ``num_retained_versions`` setting of this table's TableVersion."""
    # Go through tbl_version_path explicitly, like the other methods of this class: this class never sets a
    # 'tbl_version' attribute, so 'self.tbl_version' only resolves via the __getattr__() fallback.
    return self.tbl_version_path.tbl_version.num_retained_versions

@num_retained_versions.setter
def num_retained_versions(self, new_num_retained_versions: int) -> None:
    """Update the setting by calling ``set_num_retained_versions()`` on this table's TableVersion."""
    self.tbl_version_path.tbl_version.set_num_retained_versions(new_num_retained_versions)
|
|
156
|
+
|
|
157
|
+
def _description(self) -> pd.DataFrame:
|
|
158
|
+
cols = self.tbl_version_path.columns()
|
|
159
|
+
df = pd.DataFrame({
|
|
160
|
+
'Column Name': [c.name for c in cols],
|
|
161
|
+
'Type': [str(c.col_type) for c in cols],
|
|
162
|
+
'Computed With': [c.value_expr.display_str(inline=False) if c.value_expr is not None else '' for c in cols],
|
|
163
|
+
})
|
|
164
|
+
return df
|
|
165
|
+
|
|
166
|
+
def _description_html(self) -> 'pd.io.formats.style.Styler':
    """Return the table description (see ``_description()``) as a styled object for HTML display.

    Note that the result is the object produced by ``DataFrame.style`` and the chained styling calls,
    not a plain DataFrame.
    """
    pd_df = self._description()
    # white-space: pre-wrap: print \n as newline
    # th: center-align headings
    # the index is hidden in the rendered output
    return pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'}) \
        .set_table_styles([dict(selector='th', props=[('text-align', 'center')])]) \
        .hide(axis='index')
|
|
173
|
+
|
|
174
|
+
def describe(self) -> None:
    """Display the description of this table.

    Under IPython, the HTML description is rendered via ``IPython.display.display()``; otherwise the
    ``repr()`` of the table is printed.
    """
    # Keep the try block limited to the environment probe: a NameError raised while building or displaying
    # the HTML description is a real error and must not be mistaken for "not running under IPython".
    try:
        __IPYTHON__
    except NameError:
        print(repr(self))
        return
    from IPython.display import display
    display(self._description_html())
|
|
181
|
+
|
|
182
|
+
# TODO: Display comments in _repr_html()
|
|
183
|
+
def __repr__(self) -> str:
|
|
184
|
+
description_str = self._description().to_string(index=False)
|
|
185
|
+
if self.comment is None:
|
|
186
|
+
comment = ''
|
|
187
|
+
else:
|
|
188
|
+
comment = f'{self.comment}\n'
|
|
189
|
+
return f'{self.display_name()} \'{self._name}\'\n{comment}{description_str}'
|
|
190
|
+
|
|
191
|
+
def _repr_html_(self) -> str:
|
|
192
|
+
return self._description_html()._repr_html_()
|
|
193
|
+
|
|
194
|
+
def _drop(self) -> None:
    """Drop this table.

    Drops the underlying TableVersion, marks this object as dropped and removes it from the catalog.

    Raises:
        Error: If the table has already been dropped.
    """
    self._check_is_dropped()
    tbl_version = self.tbl_version_path.tbl_version
    tbl_version.drop()
    self.is_dropped = True
    # update catalog
    registered_tbls = catalog.Catalog.get().tbls
    del registered_tbls[self._id]
|
|
201
|
+
|
|
202
|
+
# TODO Factor this out into a separate module.
# The return type is unresolvable, but torch can't be imported since it's an optional dependency.
def to_pytorch_dataset(self, image_format : str = 'pt') -> 'torch.utils.data.IterableDataset':
    """Return a PyTorch Dataset for this table.

    See DataFrame.to_pytorch_dataset()

    Args:
        image_format: forwarded to ``DataFrame.to_pytorch_dataset()``.
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.to_pytorch_dataset(image_format=image_format)
|
|
210
|
+
|
|
211
|
+
def to_coco_dataset(self) -> Path:
    """Return the path to a COCO json file for this table.

    See DataFrame.to_coco_dataset()
    """
    # local import: avoid circular imports
    from pixeltable.dataframe import DataFrame
    frame = DataFrame(self.tbl_version_path)
    return frame.to_coco_dataset()
|
|
217
|
+
|
|
218
|
+
def __setitem__(self, column_name: str, value: Union[ts.ColumnType, exprs.Expr, Callable, dict]) -> None:
    """Adds a column to the table

    Args:
        column_name: the name of the new column
        value: column type or value expression or column specification dictionary:
            column type: a Pixeltable column type (if the table already contains rows, it must be nullable)
            value expression: a Pixeltable expression that computes the column values
            column specification: a dictionary with possible keys 'type', 'value', 'stored'

    Raises:
        Error: If the table has been dropped, or if the column name is not a string, is not a valid
            identifier, is reserved or already exists.

    Examples:
        Add an int column with ``None`` values:

        >>> tbl['new_col'] = IntType(nullable=True)

        For a table with int column ``int_col``, add a column that is the factorial of ``int_col``. The names of
        the parameters of the Callable must correspond to existing column names (the column values are then passed
        as arguments to the Callable). In this case, the return type cannot be inferred and needs to be specified
        explicitly:

        >>> tbl['factorial'] = {'value': lambda int_col: math.factorial(int_col), 'type': IntType()}

        For a table with an image column ``frame``, add an image column ``rotated`` that rotates the image by
        90 degrees. In this case, the column type is inferred from the expression. Also, the column is not stored
        (by default, computed image columns are not stored but recomputed on demand):

        >>> tbl['rotated'] = tbl.frame.rotate(90)

        Do the same, but now the column is stored:

        >>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
    """
    # same guard as add_column(): a dropped table must not be modified
    self._check_is_dropped()
    if not isinstance(column_name, str):
        raise excs.Error(f'Column name must be a string, got {type(column_name)}')
    if not is_valid_identifier(column_name):
        raise excs.Error(f'Invalid column name: {column_name!r}')

    new_col = self._create_columns({column_name: value})[0]
    self._verify_column(new_col, self.column_names())
    # Python discards the result of an item assignment, so the status returned by add_column() is not
    # propagated (use Table.add_column() to obtain it)
    self.tbl_version_path.tbl_version.add_column(new_col)
|
|
256
|
+
|
|
257
|
+
def add_column(
        self, *,
        type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None, print_stats: bool = False,
        **kwargs: Any
) -> UpdateStatus:
    """Adds a column to the table.

    Args:
        kwargs: Exactly one keyword argument of the form ``column-name=type|value-expression``.
        type: The type of the column. Only valid and required if ``value-expression`` is a Callable.
        stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
        print_stats: If ``True``, print execution metrics.

    Returns:
        execution status

    Raises:
        Error: If the column name is invalid or already exists.

    Examples:
        Add an int column with ``None`` values:

        >>> tbl.add_column(new_col=IntType())

        Alternatively, this can also be expressed as:

        >>> tbl['new_col'] = IntType()

        For a table with int column ``int_col``, add a column that is the factorial of ``int_col``. The names of
        the parameters of the Callable must correspond to existing column names (the column values are then passed
        as arguments to the Callable). In this case, the column type needs to be specified explicitly:

        >>> tbl.add_column(factorial=lambda int_col: math.factorial(int_col), type=IntType())

        Alternatively, this can also be expressed as:

        >>> tbl['factorial'] = {'value': lambda int_col: math.factorial(int_col), 'type': IntType()}

        For a table with an image column ``frame``, add an image column ``rotated`` that rotates the image by
        90 degrees. In this case, the column type is inferred from the expression. Also, the column is not stored
        (by default, computed image columns are not stored but recomputed on demand):

        >>> tbl.add_column(rotated=tbl.frame.rotate(90))

        Alternatively, this can also be expressed as:

        >>> tbl['rotated'] = tbl.frame.rotate(90)

        Do the same, but now the column is stored:

        >>> tbl.add_column(rotated=tbl.frame.rotate(90), stored=True)

        Alternatively, this can also be expressed as:

        >>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
    """
    self._check_is_dropped()

    # the column is passed as the single remaining keyword argument
    num_specs = len(kwargs)
    if num_specs != 1:
        given_names = ', '.join(kwargs)
        raise excs.Error(
            'add_column() requires exactly one keyword argument of the form "column-name=type|value-expression", '
            f'got {num_specs} instead ({given_names})')
    (col_name, spec), = kwargs.items()

    # an explicit 'type' is only meaningful for a Callable; a ColumnType or an Expr already determines it
    type_given = type is not None
    if type_given and isinstance(spec, (ts.ColumnType, exprs.Expr)):
        raise excs.Error('add_column(): keyword argument "type" is redundant')

    # translate the arguments into the dictionary form understood by _create_columns()
    col_schema: Dict[str, Any] = {}
    if isinstance(spec, ts.ColumnType):
        col_schema['type'] = spec
    else:
        col_schema['value'] = spec
        if type_given:
            col_schema['type'] = type
    if stored is not None:
        col_schema['stored'] = stored

    new_col = self._create_columns({col_name: col_schema})[0]
    self._verify_column(new_col, self.column_names())
    tbl_version = self.tbl_version_path.tbl_version
    return tbl_version.add_column(new_col, print_stats=print_stats)
|
|
338
|
+
|
|
339
|
+
@classmethod
def _validate_column_spec(cls, name: str, spec: Dict[str, Any]) -> None:
    """Check integrity of user-supplied Column spec

    We unfortunately can't use something like jsonschema for validation, because this isn't strictly a JSON schema
    (on account of containing Python Callables or Exprs).

    Args:
        name: name of the column; only used in error messages.
        spec: the column specification dictionary; the accepted keys are 'type', 'value' and 'stored'.

    Raises:
        Error: If the spec contains an unknown key, a value of the wrong kind, a redundant 'type', or if
            the column type cannot be determined from it.
    """
    assert isinstance(spec, dict)
    allowed_keys = {'type', 'value', 'stored'}
    for key in spec:
        if key not in allowed_keys:
            raise excs.Error(f'Column {name}: invalid key {key!r}')

    type_given = 'type' in spec
    if type_given and not isinstance(spec['type'], ts.ColumnType):
        raise excs.Error(f'Column {name}: "type" must be a ColumnType, got {spec["type"]}')

    # the column type is known if it is given explicitly or if the value is a Pixeltable expression
    type_known = type_given
    if 'value' in spec:
        value_spec = spec['value']
        if exprs.Expr.from_object(value_spec) is not None:
            if type_given:
                raise excs.Error(f'Column {name}: "type" is redundant if value is a Pixeltable expression')
            type_known = True
        else:
            # not convertible to an expression: needs to be a Callable, accompanied by an explicit type
            if not isinstance(value_spec, Callable):
                raise excs.Error(
                    f'Column {name}: value needs to be either a Pixeltable expression or a Callable, '
                    f'but it is a {type(value_spec)}')
            if not type_given:
                raise excs.Error(f'Column {name}: "type" is required if value is a Callable')

    if 'stored' in spec and not isinstance(spec['stored'], bool):
        raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
    if not type_known:
        raise excs.Error(f'Column {name}: "type" is required')
|
|
378
|
+
|
|
379
|
+
@classmethod
def _create_columns(cls, schema: Dict[str, Any]) -> List[Column]:
    """Construct list of Columns, given schema

    Args:
        schema: maps each column name to a column type, a value expression or a column specification
            dictionary (which is validated with ``_validate_column_spec()``).

    Returns:
        One Column per schema entry, in schema order.

    Raises:
        Error: If an entry is a bare Callable (a Callable must be wrapped in a specification dictionary
            together with its type), or if a specification dictionary is invalid.
    """
    columns: List[Column] = []
    for name, spec in schema.items():
        col_type: Optional[ts.ColumnType] = None
        value_expr: Optional[exprs.Expr] = None
        stored: Optional[bool] = None
        primary_key: Optional[bool] = None

        if isinstance(spec, ts.ColumnType):
            # TODO: create copy
            col_type = spec
        elif isinstance(spec, exprs.Expr):
            # create copy so we can modify it
            value_expr = spec.copy()
        elif isinstance(spec, Callable):
            raise excs.Error((
                f'Column {name} computed with a Callable: specify using a dictionary with '
                f'the "value" and "type" keys (e.g., "{name}": {{"value": <Callable>, "type": IntType()}})'
            ))
        elif isinstance(spec, dict):
            cls._validate_column_spec(name, spec)
            col_type, stored = spec.get('type'), spec.get('stored')
            # NOTE(review): _validate_column_spec() rejects any key other than 'type', 'value' and 'stored',
            # so 'primary_key' is always absent here
            primary_key = spec.get('primary_key')
            value_expr = spec.get('value')
            if isinstance(value_expr, exprs.Expr):
                # create copy so we can modify it
                value_expr = value_expr.copy()

        columns.append(
            Column(name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key))
    return columns
|
|
414
|
+
|
|
415
|
+
@classmethod
def _verify_column(cls, col: Column, existing_column_names: Set[str]) -> None:
    """Check integrity of user-supplied Column and supply defaults

    If ``col.stored`` is None, it is set here: to False for computed image columns without a window
    function call, and to True for every other column.

    Args:
        col: the column to verify; its ``stored`` attribute may be updated.
        existing_column_names: the names that are already taken.

    Raises:
        Error: If the name is reserved, invalid or a duplicate, or if ``stored=False`` is requested for a
            column that does not support it.
    """
    name = col.name
    if is_system_column_name(name):
        raise excs.Error(f'Column name {name} is reserved')
    if not is_valid_identifier(name):
        raise excs.Error(f"Invalid column name: '{name}'")
    if name in existing_column_names:
        raise excs.Error(f'Duplicate column name: {name}')

    if col.stored is False:
        # an explicit stored=False is only accepted for computed image columns ...
        if not (col.is_computed and col.col_type.is_image_type()):
            raise excs.Error(f'Column {name}: stored={col.stored} only applies to computed image columns')
        # ... that don't involve a window function call
        if not col.col_type.is_image_type() or col.has_window_fn_call():
            raise excs.Error((
                f'Column {name}: stored={col.stored} is not valid for image columns computed with a streaming '
                f'function'))
    elif col.stored is None:
        recomputed_on_demand = col.is_computed and col.col_type.is_image_type() and not col.has_window_fn_call()
        col.stored = not recomputed_on_demand
|
|
432
|
+
|
|
433
|
+
@classmethod
|
|
434
|
+
def _verify_schema(cls, schema: List[Column]) -> None:
|
|
435
|
+
"""Check integrity of user-supplied schema and set defaults"""
|
|
436
|
+
column_names: Set[str] = set()
|
|
437
|
+
for col in schema:
|
|
438
|
+
cls._verify_column(col, column_names)
|
|
439
|
+
column_names.add(col.name)
|
|
440
|
+
|
|
441
|
+
def drop_column(self, name: str) -> None:
    """Drop a column from the table.

    Args:
        name: The name of the column to drop.

    Raises:
        Error: If the column does not exist or if it is referenced by a computed column.

    Examples:
        Drop column ``factorial``:

        >>> tbl.drop_column('factorial')
    """
    self._check_is_dropped()
    tbl_version = self.tbl_version_path.tbl_version
    tbl_version.drop_column(name)
|
|
457
|
+
|
|
458
|
+
def rename_column(self, old_name: str, new_name: str) -> None:
    """Rename a column.

    Args:
        old_name: The current name of the column.
        new_name: The new name of the column.

    Raises:
        Error: If the column does not exist or if the new name is invalid or already exists.

    Examples:
        Rename column ``factorial`` to ``fac``:

        >>> tbl.rename_column('factorial', 'fac')
    """
    self._check_is_dropped()
    tbl_version = self.tbl_version_path.tbl_version
    tbl_version.rename_column(old_name, new_name)
|
|
475
|
+
|
|
476
|
+
def add_embedding_index(
        self, col_name: str, *, idx_name: Optional[str] = None,
        text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None,
        metric: str = 'cosine'
) -> None:
    """Add an index to the table.

    Args:
        col_name: name of column to index
        idx_name: name of index, which needs to be unique for the table; if not provided, a name will be generated
        text_embed: function to embed text; required if the column is a text column
        img_embed: function to embed images; required if the column is an image column
        metric: distance metric to use for the index; one of 'cosine', 'ip', 'l2'; default is 'cosine'

    Raises:
        Error: If an index with that name already exists for the table or if the column does not exist.

    Examples:
        Add an index to the ``img`` column:

        >>> tbl.add_embedding_index('img', img_embed=...)

        Add another index to the ``img`` column, using the inner product as the distance metric,
        and with a specific name; ``text_embed`` is also specified in order to search with text:

        >>> tbl.add_embedding_index(
            'img', idx_name='clip_idx', img_embed=..., text_embed=...text_embed..., metric='ip')
    """
    path = self.tbl_version_path
    if path.is_snapshot():
        raise excs.Error('Cannot add an index to a snapshot')
    self._check_is_dropped()

    # columns of base tables are included in the lookup
    col = path.get_column(col_name, include_bases=True)
    if col is None:
        raise excs.Error(f'Column {col_name} unknown')
    if idx_name is not None and idx_name in path.tbl_version.idxs_by_name:
        raise excs.Error(f'Duplicate index name: {idx_name}')

    from pixeltable.index import EmbeddingIndex
    # create the EmbeddingIndex instance to verify args
    idx = EmbeddingIndex(col, metric=metric, text_embed=text_embed, img_embed=img_embed)
    # the value returned by add_index() is not used
    path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
    # TODO: how to deal with exceptions here? drop the index and raise?
|
|
516
|
+
|
|
517
|
+
def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
    """Drop an index from the table.

    Args:
        column_name: The name of the column whose index to drop. Invalid if the column has multiple indices.
        idx_name: The name of the index to drop.

    Raises:
        Error: If the index does not exist.

    Examples:
        Drop index on the ``img`` column:

        >>> tbl.drop_index(column_name='img')
    """
    path = self.tbl_version_path
    if path.is_snapshot():
        raise excs.Error('Cannot drop an index from a snapshot')
    self._check_is_dropped()
    if (column_name is None) == (idx_name is None):
        raise excs.Error('Exactly one of column_name or idx_name must be provided')
    tbl_version = path.tbl_version

    if idx_name is None:
        # identify the index via its column; this needs to be unambiguous
        col = path.get_column(column_name, include_bases=True)
        if col is None:
            raise excs.Error(f'Column {column_name} unknown')
        if col.tbl.id != tbl_version.id:
            raise excs.Error(
                f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
        matching_ids = [info.id for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
        if not matching_ids:
            raise excs.Error(f'Column {column_name} does not have an index')
        if len(matching_ids) > 1:
            raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
        idx_id = matching_ids[0]
    else:
        if idx_name not in tbl_version.idxs_by_name:
            raise excs.Error(f'Index {idx_name} does not exist')
        idx_id = tbl_version.idxs_by_name[idx_name].id

    tbl_version.drop_index(idx_id)
|
|
557
|
+
|
|
558
|
+
def update(
    self, value_spec: dict[str, Any], where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
) -> UpdateStatus:
    """Assign new values to columns of the rows selected by ``where``.

    Args:
        value_spec: a dictionary mapping column names to literal values or Pixeltable expressions.
        where: a Predicate selecting the rows to update; when omitted, no row filter is passed on.
        cascade: if True, also update all computed columns that transitively depend on the updated columns.

    Raises:
        Error: If the table is a snapshot, if ``where`` is not a Predicate, or if ``where`` leaves a filter
            that cannot be expressed in SQL.

    Examples:
        Set column `int_col` to 1 for all rows:

        >>> tbl.update({'int_col': 1})

        Set column `int_col` to 1 for all rows where `int_col` is 0:

        >>> tbl.update({'int_col': 1}, where=tbl.int_col == 0)

        Set `int_col` to the value of `other_int_col` + 1:

        >>> tbl.update({'int_col': tbl.other_int_col + 1})

        Increment `int_col` by 1 for all rows where `int_col` is 0:

        >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
    """
    if self.tbl_version_path.is_snapshot():
        raise excs.Error('Cannot update a snapshot')
    self._check_is_dropped()

    # primary key columns are rejected here; expressions are accepted as values
    targets = self._validate_update_spec(value_spec, allow_pk=False, allow_exprs=True)
    from pixeltable.plan import Planner

    has_where = where is not None
    if has_where and not isinstance(where, exprs.Predicate):
        raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
    if has_where:
        # for now the rows to update must be identifiable via SQL alone, without a Python-side filter
        residual_filter = Planner.analyze(self.tbl_version_path, where).filter
        if residual_filter is not None:
            raise excs.Error(f'Filter {residual_filter} not expressible in SQL')

    return self.tbl_version_path.tbl_version.update(targets, where, cascade)
|
|
600
|
+
|
|
601
|
+
def batch_update(self, rows: Iterable[dict[str, Any]], cascade: bool = True) -> UpdateStatus:
    """Update individual rows in this table, each identified by its primary key or its rowid.

    Args:
        rows: an Iterable of dictionaries containing values for the updated columns plus values for the primary key
            columns. Any iterable is accepted (including generators); it is consumed exactly once. Instead of the
            primary key, every dictionary may carry the rowid pseudo-column (``ROWID_COLUMN_NAME``); whether
            rowids are used is decided from the first row and then required of all rows.
        cascade: if True, also update all computed columns that transitively depend on the updated columns.

    Raises:
        Error: If the table is a snapshot, if ``rows`` is empty, if the table has no primary key and no rowids
            are supplied, if a row lacks the rowid in rowid mode, or if a row lacks primary key columns.

    Examples:
        Update the 'name' and 'age' columns for the rows with ids 1 and 2 (assuming 'id' is the primary key):

        >>> tbl.batch_update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
    """
    if self.tbl_version_path.is_snapshot():
        raise excs.Error('Cannot update a snapshot')
    self._check_is_dropped()

    # materialize once: 'rows' may be a one-shot iterator, and the first row must be inspected before the loop
    row_specs = list(rows)
    if not row_specs:
        raise excs.Error('batch_update(): rows must contain at least one row')

    row_updates: list[dict[Column, exprs.Expr]] = []
    pk_col_names = {c.name for c in self.tbl_version_path.tbl_version.primary_key_columns()}

    # pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
    has_rowid = self.ROWID_COLUMN_NAME in row_specs[0]
    rowids: list[tuple[int, ...]] = []
    if not pk_col_names and not has_rowid:
        raise excs.Error('Table must have primary key for batch update')

    for row_spec in row_specs:
        col_vals = self._validate_update_spec(row_spec, allow_pk=not has_rowid, allow_exprs=False)
        if has_rowid:
            # the rowid has to be present in every row; raise explicitly rather than assert, so the check
            # also holds when Python runs with -O
            if self.ROWID_COLUMN_NAME not in row_spec:
                raise excs.Error(f'Pseudo-column {self.ROWID_COLUMN_NAME} missing in {row_spec}')
            rowids.append(row_spec[self.ROWID_COLUMN_NAME])
        else:
            missing_cols = pk_col_names - {col.name for col in col_vals}
            if missing_cols:
                # sorted for a deterministic error message
                raise excs.Error(
                    f'Primary key columns ({", ".join(sorted(missing_cols))}) missing in {row_spec}')
        row_updates.append(col_vals)
    return self.tbl_version_path.tbl_version.batch_update(row_updates, rowids, cascade)
|
|
640
|
+
|
|
641
|
+
def _validate_update_spec(
    self, value_spec: dict[str, Any], allow_pk: bool, allow_exprs: bool
) -> dict[Column, 'pixeltable.exprs.Expr']:
    """Check an update specification and turn it into a column-to-expression mapping.

    Args:
        value_spec: maps column names to the new values; an entry keyed by the rowid pseudo-column
            (``ROWID_COLUMN_NAME``) is skipped.
        allow_pk: if False, primary key columns are rejected as update targets.
        allow_exprs: if False, every value has to be a literal of the column's type.

    Returns:
        A dictionary from each resolved Column to the expression holding its new value.

    Raises:
        Error: If a key is not a string, the column is unknown, computed, a disallowed primary key column or
            of a media type, or if the value is neither a valid literal nor a type-compatible expression.
    """
    from pixeltable import exprs

    validated: dict[Column, exprs.Expr] = {}
    for name, new_val in value_spec.items():
        if not isinstance(name, str):
            raise excs.Error(f'Update specification: dict key must be column name, got {name!r}')
        if name == self.ROWID_COLUMN_NAME:
            # the pseudo-column _rowid is not an update target
            continue

        # only columns of this table are considered, not those of its bases
        target = self.tbl_version_path.get_column(name, include_bases=False)
        if target is None:
            # TODO: return more informative error if this is trying to update a base column
            raise excs.Error(f'Column {name} unknown')
        if target.is_computed:
            raise excs.Error(f'Column {name} is computed and cannot be updated')
        if target.is_pk and not allow_pk:
            raise excs.Error(f'Column {name} is a primary key column and cannot be updated')
        if target.col_type.is_media_type():
            raise excs.Error(f'Column {name} has type image/video/audio/document and cannot be updated')

        # the value has to be compatible with the column type; first attempt to treat it as a literal
        try:
            new_expr = exprs.Literal(new_val, col_type=target.col_type)
        except TypeError:
            if not allow_exprs:
                raise excs.Error(
                    f'Column {name}: value {new_val!r} is not a valid literal for this column '
                    f'(expected {target.col_type})')
            # not a literal: fall back to building an expression from the value
            new_expr = exprs.Expr.from_object(new_val)
            if new_expr is None:
                raise excs.Error(f'Column {name}: value {new_val!r} is not a recognized literal or expression')
            if not target.col_type.matches(new_expr.col_type):
                raise excs.Error((
                    f'Type of value {new_val!r} ({new_expr.col_type}) is not compatible with the type of column '
                    f'{name} ({target.col_type})'
                ))
        validated[target] = new_expr

    return validated
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def revert(self) -> None:
    """Roll this table back to its previous version.

    Snapshots cannot be reverted; attempting to do so raises an Error.

    .. warning::
        This operation is irreversible.
    """
    is_snapshot = self.tbl_version_path.is_snapshot()
    if is_snapshot:
        raise excs.Error('Cannot revert a snapshot')
    self._check_is_dropped()
    current_version = self.tbl_version_path.tbl_version
    current_version.revert()
|