pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -1,18 +1,32 @@
|
|
|
1
1
|
from .catalog import Column, Table, InsertableTable, View
|
|
2
|
-
from .client import Client
|
|
3
2
|
from .dataframe import DataFrame
|
|
3
|
+
from .datatransfer import Remote
|
|
4
|
+
from .catalog import Column, Table, InsertableTable, View
|
|
4
5
|
from .exceptions import Error, Error
|
|
5
6
|
from .exprs import RELATIVE_PATH_ROOT
|
|
6
7
|
from .func import Function, udf, uda, Aggregator, expr_udf
|
|
7
|
-
from .
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
from .globals import *
|
|
9
|
+
from .type_system import (
|
|
10
|
+
ColumnType,
|
|
11
|
+
StringType,
|
|
12
|
+
IntType,
|
|
13
|
+
FloatType,
|
|
14
|
+
BoolType,
|
|
15
|
+
TimestampType,
|
|
16
|
+
JsonType,
|
|
17
|
+
ArrayType,
|
|
18
|
+
ImageType,
|
|
19
|
+
VideoType,
|
|
20
|
+
AudioType,
|
|
21
|
+
DocumentType,
|
|
22
|
+
)
|
|
10
23
|
from .utils.help import help
|
|
24
|
+
|
|
11
25
|
# noinspection PyUnresolvedReferences
|
|
12
|
-
from . import functions
|
|
26
|
+
from . import functions, io, iterators
|
|
27
|
+
from .__version__ import __version__, __version_tuple__
|
|
13
28
|
|
|
14
29
|
__all__ = [
|
|
15
|
-
'Client',
|
|
16
30
|
'DataFrame',
|
|
17
31
|
'Column',
|
|
18
32
|
'Table',
|
|
@@ -39,6 +53,3 @@ __all__ = [
|
|
|
39
53
|
'uda',
|
|
40
54
|
'expr_udf',
|
|
41
55
|
]
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
pixeltable/catalog/column.py
CHANGED
|
@@ -5,8 +5,8 @@ from typing import Optional, Union, Callable, Set
|
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
import pixeltable.exceptions as excs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
10
10
|
from .globals import is_valid_identifier
|
|
11
11
|
|
|
12
12
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -18,11 +18,12 @@ class Column:
|
|
|
18
18
|
table/view.
|
|
19
19
|
"""
|
|
20
20
|
def __init__(
|
|
21
|
-
self, name: Optional[str], col_type: Optional[ColumnType] = None,
|
|
21
|
+
self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
|
|
22
22
|
computed_with: Optional[Union['Expr', Callable]] = None,
|
|
23
23
|
is_pk: bool = False, stored: Optional[bool] = None,
|
|
24
24
|
col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
|
|
25
|
-
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
|
|
25
|
+
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None,
|
|
26
|
+
records_errors: Optional[bool] = None
|
|
26
27
|
):
|
|
27
28
|
"""Column constructor.
|
|
28
29
|
|
|
@@ -80,12 +81,19 @@ class Column:
|
|
|
80
81
|
assert self.col_type is not None
|
|
81
82
|
|
|
82
83
|
self.stored = stored
|
|
83
|
-
self.dependent_cols:
|
|
84
|
+
self.dependent_cols: set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
|
|
84
85
|
self.id = col_id
|
|
85
86
|
self.is_pk = is_pk
|
|
86
87
|
self.schema_version_add = schema_version_add
|
|
87
88
|
self.schema_version_drop = schema_version_drop
|
|
88
89
|
|
|
90
|
+
# stored_proxy may be set later if this is a non-stored column.
|
|
91
|
+
# if col1.stored_proxy == col2, then also col1 == col2.proxy_base.
|
|
92
|
+
self.stored_proxy: Optional[Column] = None
|
|
93
|
+
self.proxy_base: Optional[Column] = None
|
|
94
|
+
|
|
95
|
+
self._records_errors = records_errors
|
|
96
|
+
|
|
89
97
|
# column in the stored table for the values of this Column
|
|
90
98
|
self.sa_col: Optional[sql.schema.Column] = None
|
|
91
99
|
self.sa_col_type = sa_col_type
|
|
@@ -93,6 +101,7 @@ class Column:
|
|
|
93
101
|
# computed cols also have storage columns for the exception string and type
|
|
94
102
|
self.sa_errormsg_col: Optional[sql.schema.Column] = None
|
|
95
103
|
self.sa_errortype_col: Optional[sql.schema.Column] = None
|
|
104
|
+
|
|
96
105
|
from .table_version import TableVersion
|
|
97
106
|
self.tbl: Optional[TableVersion] = None # set by owning TableVersion
|
|
98
107
|
|
|
@@ -114,6 +123,10 @@ class Column:
|
|
|
114
123
|
l = list(self.value_expr.subexprs(filter=lambda e: isinstance(e, exprs.FunctionCall) and e.is_window_fn_call))
|
|
115
124
|
return len(l) > 0
|
|
116
125
|
|
|
126
|
+
def get_idx_info(self) -> dict[str, 'pixeltable.catalog.TableVersion.IndexInfo']:
|
|
127
|
+
assert self.tbl is not None
|
|
128
|
+
return {name: info for name, info in self.tbl.idxs_by_name.items() if info.col == self}
|
|
129
|
+
|
|
117
130
|
@property
|
|
118
131
|
def is_computed(self) -> bool:
|
|
119
132
|
return self.compute_func is not None or self.value_expr is not None
|
|
@@ -127,6 +140,9 @@ class Column:
|
|
|
127
140
|
@property
|
|
128
141
|
def records_errors(self) -> bool:
|
|
129
142
|
"""True if this column also stores error information."""
|
|
143
|
+
# default: record errors for computed and media columns
|
|
144
|
+
if self._records_errors is not None:
|
|
145
|
+
return self._records_errors
|
|
130
146
|
return self.is_stored and (self.is_computed or self.col_type.is_media_type())
|
|
131
147
|
|
|
132
148
|
def source(self) -> None:
|
|
@@ -148,8 +164,8 @@ class Column:
|
|
|
148
164
|
self.store_name(), self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type,
|
|
149
165
|
nullable=True)
|
|
150
166
|
if self.is_computed or self.col_type.is_media_type():
|
|
151
|
-
self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), StringType().to_sa_type(), nullable=True)
|
|
152
|
-
self.sa_errortype_col = sql.Column(self.errortype_store_name(), StringType().to_sa_type(), nullable=True)
|
|
167
|
+
self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), ts.StringType().to_sa_type(), nullable=True)
|
|
168
|
+
self.sa_errortype_col = sql.Column(self.errortype_store_name(), ts.StringType().to_sa_type(), nullable=True)
|
|
153
169
|
|
|
154
170
|
def get_sa_col_type(self) -> sql.sqltypes.TypeEngine:
|
|
155
171
|
return self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type
|
|
@@ -60,25 +60,29 @@ class InsertableTable(Table):
|
|
|
60
60
|
return tbl
|
|
61
61
|
|
|
62
62
|
@overload
|
|
63
|
-
def insert(
|
|
63
|
+
def insert(
|
|
64
|
+
self, rows: Iterable[Dict[str, Any]], /, *, print_stats: bool = False, fail_on_exception: bool = True
|
|
65
|
+
) -> UpdateStatus: ...
|
|
64
66
|
|
|
65
67
|
@overload
|
|
66
|
-
def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...
|
|
68
|
+
def insert(self, *, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any) -> UpdateStatus: ...
|
|
67
69
|
|
|
68
|
-
def insert(
|
|
69
|
-
|
|
70
|
+
def insert(
|
|
71
|
+
self, rows: Optional[Iterable[dict[str, Any]]] = None, /, *, print_stats: bool = False,
|
|
72
|
+
fail_on_exception: bool = True, **kwargs: Any
|
|
73
|
+
) -> UpdateStatus:
|
|
74
|
+
"""Inserts rows into this table. There are two mutually exclusive call patterns:
|
|
70
75
|
|
|
71
76
|
To insert multiple rows at a time:
|
|
72
|
-
|
|
73
|
-
``insert(rows: List[Dict[str, Any]], print_stats: bool = False, fail_on_exception: bool = True)``
|
|
77
|
+
``insert(rows: Iterable[dict[str, Any]], /, *, print_stats: bool = False, fail_on_exception: bool = True)``
|
|
74
78
|
|
|
75
79
|
To insert just a single row, you can use the more convenient syntax:
|
|
76
|
-
``insert(print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any)``
|
|
80
|
+
``insert(*, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any)``
|
|
77
81
|
|
|
78
82
|
Args:
|
|
79
83
|
rows: (if inserting multiple rows) A list of rows to insert, each of which is a dictionary mapping column
|
|
80
84
|
names to values.
|
|
81
|
-
kwargs: (if inserting a single row)
|
|
85
|
+
kwargs: (if inserting a single row) Keyword-argument pairs representing column names and values.
|
|
82
86
|
print_stats: If ``True``, print statistics about the cost of computed columns.
|
|
83
87
|
fail_on_exception:
|
|
84
88
|
Determines how exceptions in computed columns and invalid media files (e.g., corrupt images)
|
|
@@ -102,16 +106,27 @@ class InsertableTable(Table):
|
|
|
102
106
|
|
|
103
107
|
>>> tbl.insert(a=1, b=1, c=1)
|
|
104
108
|
"""
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
109
|
+
# The commented code is the intended implementation, with signature (*args, **kwargs).
|
|
110
|
+
# That signature cannot be used currently, due to a present limitation in mkdocs.
|
|
111
|
+
# See: https://github.com/mkdocstrings/mkdocstrings/issues/669
|
|
112
|
+
|
|
113
|
+
# print_stats = kwargs.pop('print_stats', False)
|
|
114
|
+
# fail_on_exception = kwargs.pop('fail_on_exception', True)
|
|
115
|
+
# if len(args) > 0:
|
|
116
|
+
# # There's a positional argument; this means `rows` is expressed as a
|
|
117
|
+
# # list of dicts (multi-insert)
|
|
118
|
+
# rows = list(args[0])
|
|
119
|
+
# else:
|
|
120
|
+
# # No positional argument; this means we're inserting a single row
|
|
121
|
+
# # using kwargs syntax
|
|
122
|
+
# rows = [kwargs]
|
|
123
|
+
|
|
124
|
+
if rows is None:
|
|
114
125
|
rows = [kwargs]
|
|
126
|
+
else:
|
|
127
|
+
rows = list(rows)
|
|
128
|
+
if len(kwargs) > 0:
|
|
129
|
+
raise excs.Error('`kwargs` cannot be specified unless `rows is None`.')
|
|
115
130
|
|
|
116
131
|
if not isinstance(rows, list):
|
|
117
132
|
raise excs.Error('rows must be a list of dictionaries')
|
|
@@ -185,8 +200,6 @@ class InsertableTable(Table):
|
|
|
185
200
|
if not isinstance(where, Predicate):
|
|
186
201
|
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
187
202
|
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
188
|
-
if analysis_info.similarity_clause is not None:
|
|
189
|
-
raise excs.Error('nearest() cannot be used with delete()')
|
|
190
203
|
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
191
204
|
if analysis_info.filter is not None:
|
|
192
205
|
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
pixeltable/catalog/table.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import itertools
|
|
3
4
|
import json
|
|
4
5
|
import logging
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
|
|
7
|
+
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable, Type
|
|
7
8
|
from uuid import UUID
|
|
8
9
|
|
|
9
10
|
import pandas as pd
|
|
@@ -16,6 +17,7 @@ import pixeltable.exceptions as excs
|
|
|
16
17
|
import pixeltable.exprs as exprs
|
|
17
18
|
import pixeltable.metadata.schema as schema
|
|
18
19
|
import pixeltable.type_system as ts
|
|
20
|
+
import pixeltable.index as index
|
|
19
21
|
from .column import Column
|
|
20
22
|
from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
|
|
21
23
|
from .schema_object import SchemaObject
|
|
@@ -97,27 +99,31 @@ class Table(SchemaObject):
|
|
|
97
99
|
from pixeltable.dataframe import DataFrame
|
|
98
100
|
return DataFrame(self.tbl_version_path).order_by(*items, asc=asc)
|
|
99
101
|
|
|
100
|
-
def
|
|
101
|
-
"""Return
|
|
102
|
-
|
|
102
|
+
def group_by(self, *items: 'exprs.Expr') -> 'pixeltable.dataframe.DataFrame':
|
|
103
|
+
"""Return a DataFrame for this table."""
|
|
104
|
+
from pixeltable.dataframe import DataFrame
|
|
105
|
+
return DataFrame(self.tbl_version_path).group_by(*items)
|
|
106
|
+
|
|
107
|
+
def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
108
|
+
"""Return rows from this table."""
|
|
103
109
|
return self.df().collect()
|
|
104
110
|
|
|
105
111
|
def show(
|
|
106
112
|
self, *args, **kwargs
|
|
107
|
-
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
113
|
+
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
108
114
|
"""Return rows from this table.
|
|
109
115
|
"""
|
|
110
116
|
return self.df().show(*args, **kwargs)
|
|
111
117
|
|
|
112
118
|
def head(
|
|
113
119
|
self, *args, **kwargs
|
|
114
|
-
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
120
|
+
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
115
121
|
"""Return the first n rows inserted into this table."""
|
|
116
122
|
return self.df().head(*args, **kwargs)
|
|
117
123
|
|
|
118
124
|
def tail(
|
|
119
125
|
self, *args, **kwargs
|
|
120
|
-
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
126
|
+
) -> 'pixeltable.dataframe.DataFrameResultSet':
|
|
121
127
|
"""Return the last n rows inserted into this table."""
|
|
122
128
|
return self.df().tail(*args, **kwargs)
|
|
123
129
|
|
|
@@ -470,13 +476,16 @@ class Table(SchemaObject):
|
|
|
470
476
|
|
|
471
477
|
def add_embedding_index(
|
|
472
478
|
self, col_name: str, *, idx_name: Optional[str] = None,
|
|
473
|
-
text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None
|
|
479
|
+
text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None,
|
|
480
|
+
metric: str = 'cosine'
|
|
474
481
|
) -> None:
|
|
475
482
|
"""Add an index to the table.
|
|
476
483
|
Args:
|
|
477
484
|
col_name: name of column to index
|
|
478
485
|
idx_name: name of index, which needs to be unique for the table; if not provided, a name will be generated
|
|
479
|
-
|
|
486
|
+
text_embed: function to embed text; required if the column is a text column
|
|
487
|
+
img_embed: function to embed images; required if the column is an image column
|
|
488
|
+
metric: distance metric to use for the index; one of 'cosine', 'ip', 'l2'; default is 'cosine'
|
|
480
489
|
|
|
481
490
|
Raises:
|
|
482
491
|
Error: If an index with that name already exists for the table or if the column does not exist.
|
|
@@ -484,11 +493,13 @@ class Table(SchemaObject):
|
|
|
484
493
|
Examples:
|
|
485
494
|
Add an index to the ``img`` column:
|
|
486
495
|
|
|
487
|
-
>>> tbl.add_embedding_index('img',
|
|
496
|
+
>>> tbl.add_embedding_index('img', img_embed=...)
|
|
488
497
|
|
|
489
|
-
Add another index to the ``img`` column,
|
|
498
|
+
Add another index to the ``img`` column, using the inner product as the distance metric,
|
|
499
|
+
and with a specific name; ``text_embed`` is also specified in order to search with text:
|
|
490
500
|
|
|
491
|
-
>>> tbl.add_embedding_index(
|
|
501
|
+
>>> tbl.add_embedding_index(
|
|
502
|
+
'img', idx_name='clip_idx', img_embed=..., text_embed=...text_embed..., metric='ip')
|
|
492
503
|
"""
|
|
493
504
|
if self.tbl_version_path.is_snapshot():
|
|
494
505
|
raise excs.Error('Cannot add an index to a snapshot')
|
|
@@ -500,10 +511,28 @@ class Table(SchemaObject):
|
|
|
500
511
|
raise excs.Error(f'Duplicate index name: {idx_name}')
|
|
501
512
|
from pixeltable.index import EmbeddingIndex
|
|
502
513
|
# create the EmbeddingIndex instance to verify args
|
|
503
|
-
idx = EmbeddingIndex(col, text_embed=text_embed, img_embed=img_embed)
|
|
514
|
+
idx = EmbeddingIndex(col, metric=metric, text_embed=text_embed, img_embed=img_embed)
|
|
504
515
|
status = self.tbl_version_path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
|
|
505
516
|
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
506
517
|
|
|
518
|
+
def drop_embedding_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
|
|
519
|
+
"""Drop an embedding index from the table.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
column_name: The name of the column whose embedding index to drop. Invalid if the column has multiple
|
|
523
|
+
embedding indices.
|
|
524
|
+
idx_name: The name of the index to drop.
|
|
525
|
+
|
|
526
|
+
Raises:
|
|
527
|
+
Error: If the index does not exist.
|
|
528
|
+
|
|
529
|
+
Examples:
|
|
530
|
+
Drop embedding index on the ``img`` column:
|
|
531
|
+
|
|
532
|
+
>>> tbl.drop_embedding_index(column_name='img')
|
|
533
|
+
"""
|
|
534
|
+
self._drop_index(column_name=column_name, idx_name=idx_name, _idx_class=index.EmbeddingIndex)
|
|
535
|
+
|
|
507
536
|
def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
|
|
508
537
|
"""Drop an index from the table.
|
|
509
538
|
|
|
@@ -519,6 +548,12 @@ class Table(SchemaObject):
|
|
|
519
548
|
|
|
520
549
|
>>> tbl.drop_index(column_name='img')
|
|
521
550
|
"""
|
|
551
|
+
self._drop_index(column_name=column_name, idx_name=idx_name)
|
|
552
|
+
|
|
553
|
+
def _drop_index(
|
|
554
|
+
self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None,
|
|
555
|
+
_idx_class: Optional[Type[index.IndexBase]] = None
|
|
556
|
+
) -> None:
|
|
522
557
|
if self.tbl_version_path.is_snapshot():
|
|
523
558
|
raise excs.Error('Cannot drop an index from a snapshot')
|
|
524
559
|
self._check_is_dropped()
|
|
@@ -537,12 +572,14 @@ class Table(SchemaObject):
|
|
|
537
572
|
if col.tbl.id != tbl_version.id:
|
|
538
573
|
raise excs.Error(
|
|
539
574
|
f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
|
|
540
|
-
|
|
541
|
-
if
|
|
575
|
+
idx_info = [info for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
|
|
576
|
+
if _idx_class is not None:
|
|
577
|
+
idx_info = [info for info in idx_info if isinstance(info.idx, _idx_class)]
|
|
578
|
+
if len(idx_info) == 0:
|
|
542
579
|
raise excs.Error(f'Column {column_name} does not have an index')
|
|
543
|
-
if len(
|
|
580
|
+
if len(idx_info) > 1:
|
|
544
581
|
raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
|
|
545
|
-
idx_id =
|
|
582
|
+
idx_id = idx_info[0].id
|
|
546
583
|
self.tbl_version_path.tbl_version.drop_index(idx_id)
|
|
547
584
|
|
|
548
585
|
def update(
|
|
@@ -582,8 +619,6 @@ class Table(SchemaObject):
|
|
|
582
619
|
if not isinstance(where, exprs.Predicate):
|
|
583
620
|
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
584
621
|
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
585
|
-
if analysis_info.similarity_clause is not None:
|
|
586
|
-
raise excs.Error('nearest() cannot be used with update()')
|
|
587
622
|
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
588
623
|
if analysis_info.filter is not None:
|
|
589
624
|
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
@@ -674,7 +709,6 @@ class Table(SchemaObject):
|
|
|
674
709
|
|
|
675
710
|
return update_targets
|
|
676
711
|
|
|
677
|
-
|
|
678
712
|
def revert(self) -> None:
|
|
679
713
|
"""Reverts the table to the previous version.
|
|
680
714
|
|
|
@@ -685,3 +719,159 @@ class Table(SchemaObject):
|
|
|
685
719
|
raise excs.Error('Cannot revert a snapshot')
|
|
686
720
|
self._check_is_dropped()
|
|
687
721
|
self.tbl_version_path.tbl_version.revert()
|
|
722
|
+
|
|
723
|
+
def _link(
|
|
724
|
+
self,
|
|
725
|
+
remote: 'pixeltable.datatransfer.Remote',
|
|
726
|
+
col_mapping: Optional[dict[str, str]] = None
|
|
727
|
+
) -> None:
|
|
728
|
+
"""
|
|
729
|
+
Links the specified `Remote` to this table. Once a remote is linked, it can be synchronized with
|
|
730
|
+
this `Table` by calling [`Table.sync()`]. A record of the link
|
|
731
|
+
is stored in table metadata and will persist across sessions.
|
|
732
|
+
|
|
733
|
+
Args:
|
|
734
|
+
remote (pixeltable.datatransfer.Remote): The `Remote` to link to this table.
|
|
735
|
+
col_mapping: An optional mapping of columns from this `Table` to columns in the `Remote`.
|
|
736
|
+
"""
|
|
737
|
+
# TODO(aaron-siegel): Refactor `col_mapping`
|
|
738
|
+
self._check_is_dropped()
|
|
739
|
+
if remote in self._get_remotes():
|
|
740
|
+
raise excs.Error(f'That remote is already linked to table `{self.get_name()}`: {remote}')
|
|
741
|
+
push_cols = remote.get_export_columns()
|
|
742
|
+
pull_cols = remote.get_import_columns()
|
|
743
|
+
is_col_mapping_user_specified = col_mapping is not None
|
|
744
|
+
if col_mapping is None:
|
|
745
|
+
# Use the identity mapping by default if `col_mapping` is not specified
|
|
746
|
+
col_mapping = {col: col for col in itertools.chain(push_cols.keys(), pull_cols.keys())}
|
|
747
|
+
self._validate_remote(push_cols, pull_cols, col_mapping, is_col_mapping_user_specified)
|
|
748
|
+
_logger.info(f'Linking remote {remote} to table `{self.get_name()}`.')
|
|
749
|
+
self.tbl_version_path.tbl_version.link(remote, col_mapping)
|
|
750
|
+
print(f'Linked remote {remote} to table `{self.get_name()}`.')
|
|
751
|
+
|
|
752
|
+
def unlink(
|
|
753
|
+
self,
|
|
754
|
+
remotes: Optional['pixeltable.datatransfer.Remote' | list['pixeltable.datatransfer.Remote']] = None,
|
|
755
|
+
*,
|
|
756
|
+
delete_remote_data: bool = False,
|
|
757
|
+
ignore_errors: bool = False
|
|
758
|
+
) -> None:
|
|
759
|
+
"""
|
|
760
|
+
Unlinks this table's `Remote`s.
|
|
761
|
+
|
|
762
|
+
Args:
|
|
763
|
+
remotes: If specified, will unlink only the specified `Remote` or list of `Remote`s. If not specified,
|
|
764
|
+
will unlink all of this table's `Remote`s.
|
|
765
|
+
ignore_errors (bool): If `True`, no exception will be thrown if the specified `Remote` is not linked
|
|
766
|
+
to this table.
|
|
767
|
+
delete_remote_data (bool): If `True`, then the remote data source will also be deleted. WARNING: This
|
|
768
|
+
is a destructive operation that will delete data outside Pixeltable, and cannot be undone.
|
|
769
|
+
|
|
770
|
+
"""
|
|
771
|
+
self._check_is_dropped()
|
|
772
|
+
all_remotes = self._get_remotes()
|
|
773
|
+
|
|
774
|
+
if remotes is None:
|
|
775
|
+
remotes = list(all_remotes.keys())
|
|
776
|
+
elif isinstance(remotes, pixeltable.datatransfer.Remote):
|
|
777
|
+
remotes = [remotes]
|
|
778
|
+
|
|
779
|
+
# Validation
|
|
780
|
+
if not ignore_errors:
|
|
781
|
+
for remote in remotes:
|
|
782
|
+
if remote not in all_remotes:
|
|
783
|
+
raise excs.Error(f'Remote {remote} is not linked to table `{self.get_name()}`')
|
|
784
|
+
|
|
785
|
+
for remote in remotes:
|
|
786
|
+
self.tbl_version_path.tbl_version.unlink(remote)
|
|
787
|
+
print(f'Unlinked remote {remote} from table `{self.get_name()}`.')
|
|
788
|
+
if delete_remote_data:
|
|
789
|
+
remote.delete()
|
|
790
|
+
|
|
791
|
+
def _validate_remote(
|
|
792
|
+
self,
|
|
793
|
+
export_cols: dict[str, ts.ColumnType],
|
|
794
|
+
import_cols: dict[str, ts.ColumnType],
|
|
795
|
+
col_mapping: Optional[dict[str, str]],
|
|
796
|
+
is_col_mapping_user_specified: bool
|
|
797
|
+
):
|
|
798
|
+
# Validate names
|
|
799
|
+
t_cols = self.column_names()
|
|
800
|
+
for t_col, r_col in col_mapping.items():
|
|
801
|
+
if t_col not in t_cols:
|
|
802
|
+
if is_col_mapping_user_specified:
|
|
803
|
+
raise excs.Error(
|
|
804
|
+
f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{self.get_name()}` '
|
|
805
|
+
'contains no such column.'
|
|
806
|
+
)
|
|
807
|
+
else:
|
|
808
|
+
raise excs.Error(
|
|
809
|
+
f'Column `{t_col}` does not exist in Table `{self.get_name()}`. Either add a column `{t_col}`, '
|
|
810
|
+
f'or specify a `col_mapping` to associate a different column with the remote field `{r_col}`.'
|
|
811
|
+
)
|
|
812
|
+
if r_col not in export_cols and r_col not in import_cols:
|
|
813
|
+
raise excs.Error(
|
|
814
|
+
f'Column name `{r_col}` appears as a value in `col_mapping`, but the remote '
|
|
815
|
+
f'configuration has no column `{r_col}`.'
|
|
816
|
+
)
|
|
817
|
+
# Validate column specs
|
|
818
|
+
t_col_types = self.column_types()
|
|
819
|
+
for t_col, r_col in col_mapping.items():
|
|
820
|
+
t_col_type = t_col_types[t_col]
|
|
821
|
+
if r_col in export_cols:
|
|
822
|
+
# Validate that the table column can be assigned to the remote column
|
|
823
|
+
r_col_type = export_cols[r_col]
|
|
824
|
+
if not r_col_type.is_supertype_of(t_col_type):
|
|
825
|
+
raise excs.Error(
|
|
826
|
+
f'Column `{t_col}` cannot be exported to remote column `{r_col}` (incompatible types; expecting `{r_col_type}`)'
|
|
827
|
+
)
|
|
828
|
+
if r_col in import_cols:
|
|
829
|
+
# Validate that the remote column can be assigned to the table column
|
|
830
|
+
if self.tbl_version_path.get_column(t_col).is_computed:
|
|
831
|
+
raise excs.Error(
|
|
832
|
+
f'Column `{t_col}` is a computed column, which cannot be populated from a remote column'
|
|
833
|
+
)
|
|
834
|
+
r_col_type = import_cols[r_col]
|
|
835
|
+
if not t_col_type.is_supertype_of(r_col_type):
|
|
836
|
+
raise excs.Error(
|
|
837
|
+
f'Column `{t_col}` cannot be imported from remote column `{r_col}` (incompatible types; expecting `{r_col_type}`)'
|
|
838
|
+
)
|
|
839
|
+
|
|
840
|
+
def _get_remotes(self) -> dict[pixeltable.datatransfer.Remote, dict[str, str]]:
|
|
841
|
+
"""
|
|
842
|
+
Gets a `dict` of all `Remote`s linked to this table.
|
|
843
|
+
"""
|
|
844
|
+
return self.tbl_version_path.tbl_version.get_remotes()
|
|
845
|
+
|
|
846
|
+
def sync(
|
|
847
|
+
self,
|
|
848
|
+
*,
|
|
849
|
+
export_data: bool = True,
|
|
850
|
+
import_data: bool = True
|
|
851
|
+
):
|
|
852
|
+
"""
|
|
853
|
+
Synchronizes this table with its linked `Remote`s.
|
|
854
|
+
|
|
855
|
+
Args:
|
|
856
|
+
export_data: If `True`, data from this table will be exported to the external store during synchronization.
|
|
857
|
+
import_data: If `True`, data from the external store will be imported to this table during synchronization.
|
|
858
|
+
"""
|
|
859
|
+
remotes = self._get_remotes()
|
|
860
|
+
assert len(remotes) <= 1
|
|
861
|
+
|
|
862
|
+
# Validation
|
|
863
|
+
for remote in remotes:
|
|
864
|
+
col_mapping = remotes[remote]
|
|
865
|
+
r_cols = set(col_mapping.values())
|
|
866
|
+
# Validate export/import
|
|
867
|
+
if export_data and not any(col in r_cols for col in remote.get_export_columns()):
|
|
868
|
+
raise excs.Error(
|
|
869
|
+
f'Attempted to sync with export_data=True, but there are no columns to export: {remote}'
|
|
870
|
+
)
|
|
871
|
+
if import_data and not any(col in r_cols for col in remote.get_import_columns()):
|
|
872
|
+
raise excs.Error(
|
|
873
|
+
f'Attempted to sync with import_data=True, but there are no columns to import: {remote}'
|
|
874
|
+
)
|
|
875
|
+
|
|
876
|
+
for remote in remotes:
|
|
877
|
+
remote.sync(self, remotes[remote], export_data=export_data, import_data=import_data)
|