pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/catalog/table.py
CHANGED
|
@@ -2,22 +2,30 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
4
|
import builtins
|
|
5
|
+
import datetime
|
|
5
6
|
import json
|
|
6
7
|
import logging
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, Union, overload
|
|
9
|
-
|
|
10
|
-
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
11
8
|
from keyword import iskeyword as is_python_keyword
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Iterable, Literal
|
|
12
11
|
from uuid import UUID
|
|
13
12
|
|
|
14
13
|
import pandas as pd
|
|
15
14
|
import sqlalchemy as sql
|
|
15
|
+
from typing_extensions import overload
|
|
16
16
|
|
|
17
17
|
import pixeltable as pxt
|
|
18
18
|
from pixeltable import catalog, env, exceptions as excs, exprs, index, type_system as ts
|
|
19
|
-
from pixeltable.
|
|
19
|
+
from pixeltable.catalog.table_metadata import (
|
|
20
|
+
ColumnMetadata,
|
|
21
|
+
EmbeddingIndexParams,
|
|
22
|
+
IndexMetadata,
|
|
23
|
+
TableMetadata,
|
|
24
|
+
VersionMetadata,
|
|
25
|
+
)
|
|
20
26
|
from pixeltable.metadata import schema
|
|
27
|
+
from pixeltable.metadata.utils import MetadataUtils
|
|
28
|
+
from pixeltable.utils.object_stores import ObjectOps
|
|
21
29
|
|
|
22
30
|
from ..exprs import ColumnRef
|
|
23
31
|
from ..utils.description_helper import DescriptionHelper
|
|
@@ -28,13 +36,16 @@ from .globals import (
|
|
|
28
36
|
IfExistsParam,
|
|
29
37
|
IfNotExistsParam,
|
|
30
38
|
MediaValidation,
|
|
31
|
-
UpdateStatus,
|
|
32
39
|
is_system_column_name,
|
|
33
40
|
is_valid_identifier,
|
|
34
41
|
)
|
|
35
42
|
from .schema_object import SchemaObject
|
|
36
43
|
from .table_version_handle import TableVersionHandle
|
|
37
44
|
from .table_version_path import TableVersionPath
|
|
45
|
+
from .update_status import UpdateStatus
|
|
46
|
+
|
|
47
|
+
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
48
|
+
|
|
38
49
|
|
|
39
50
|
if TYPE_CHECKING:
|
|
40
51
|
import torch.utils.data
|
|
@@ -42,6 +53,7 @@ if TYPE_CHECKING:
|
|
|
42
53
|
import pixeltable.plan
|
|
43
54
|
from pixeltable.globals import TableDataSource
|
|
44
55
|
|
|
56
|
+
|
|
45
57
|
_logger = logging.getLogger('pixeltable')
|
|
46
58
|
|
|
47
59
|
|
|
@@ -49,26 +61,34 @@ class Table(SchemaObject):
|
|
|
49
61
|
"""
|
|
50
62
|
A handle to a table, view, or snapshot. This class is the primary interface through which table operations
|
|
51
63
|
(queries, insertions, updates, etc.) are performed in Pixeltable.
|
|
64
|
+
|
|
65
|
+
Every user-invoked operation that runs an ExecNode tree (directly or indirectly) needs to call
|
|
66
|
+
FileCache.emit_eviction_warnings() at the end of the operation.
|
|
52
67
|
"""
|
|
53
68
|
|
|
54
|
-
#
|
|
55
|
-
|
|
69
|
+
# the chain of TableVersions needed to run queries and supply metadata (eg, schema)
|
|
70
|
+
_tbl_version_path: TableVersionPath
|
|
56
71
|
|
|
57
|
-
|
|
58
|
-
|
|
72
|
+
# the physical TableVersion backing this Table; None for pure snapshots
|
|
73
|
+
_tbl_version: TableVersionHandle | None
|
|
59
74
|
|
|
60
75
|
def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
    """Create a table handle.

    Args:
        id: forwarded to SchemaObject.__init__ as the object's id.
        dir_id: forwarded to SchemaObject.__init__ as the containing directory's id.
        name: forwarded to SchemaObject.__init__ as the object's name.
        tbl_version_path: the chain of TableVersions needed to run queries and supply metadata.
    """
    super().__init__(id, name, dir_id)
    # no physical TableVersion is attached at construction time
    self._tbl_version = None
    self._tbl_version_path = tbl_version_path
|
|
69
79
|
|
|
70
80
|
def _move(self, new_name: str, new_dir_id: UUID) -> None:
|
|
71
|
-
self.
|
|
81
|
+
old_name = self._name
|
|
82
|
+
old_dir_id = self._dir_id
|
|
83
|
+
|
|
84
|
+
cat = catalog.Catalog.get()
|
|
85
|
+
|
|
86
|
+
@cat.register_undo_action
|
|
87
|
+
def _() -> None:
|
|
88
|
+
# TODO: We should really be invalidating the Table instance and forcing a reload.
|
|
89
|
+
self._name = old_name
|
|
90
|
+
self._dir_id = old_dir_id
|
|
91
|
+
|
|
72
92
|
super()._move(new_name, new_dir_id)
|
|
73
93
|
conn = env.Env.get().conn
|
|
74
94
|
stmt = sql.text(
|
|
@@ -81,71 +101,88 @@ class Table(SchemaObject):
|
|
|
81
101
|
)
|
|
82
102
|
conn.execute(stmt, {'new_dir_id': new_dir_id, 'new_name': json.dumps(new_name), 'id': self._id})
|
|
83
103
|
|
|
84
|
-
|
|
104
|
+
# this is duplicated from SchemaObject so that our API docs show the docstring for Table
|
|
105
|
+
def get_metadata(self) -> 'TableMetadata':
    """
    Retrieves metadata associated with this table.

    Returns:
        A [TableMetadata][pixeltable.TableMetadata] instance containing this table's metadata.
    """
    from pixeltable.catalog import retry_loop

    # the actual lookup lives in _get_metadata(); it is run through retry_loop as a non-write operation
    @retry_loop(for_write=False)
    def load_metadata() -> 'TableMetadata':
        return self._get_metadata()

    metadata = load_metadata()
    return metadata
|
|
119
|
+
|
|
120
|
+
def _get_metadata(self) -> TableMetadata:
    """Build the TableMetadata for this table from its TableVersionPath.

    Column entries are keyed by column name and index entries by index name; only embedding
    indices are reported. `is_view`, `is_snapshot` and `base` are filled in with
    False/False/None here.
    """
    tvp = self._tbl_version_path
    tv = tvp.tbl_version.get()

    column_info: dict[str, ColumnMetadata] = {
        col.name: ColumnMetadata(
            name=col.name,
            type_=col.col_type._to_str(as_schema=True),
            version_added=col.schema_version_add,
            is_stored=col.is_stored,
            is_primary_key=col.is_pk,
            media_validation=None if col.media_validation is None else col.media_validation.name.lower(),  # type: ignore[typeddict-item]
            computed_with=None if col.value_expr is None else col.value_expr.display_str(inline=False),
            defined_in=col.get_tbl().name,
        )
        for col in tvp.columns()
    }

    index_info: dict[str, IndexMetadata] = {}
    for idx_info in tv.idxs_by_name.values():
        if not isinstance(idx_info.idx, index.EmbeddingIndex):
            continue
        # render the embedding that applies to the indexed column's type
        indexed_col = ColumnRef(idx_info.col)
        embedding = idx_info.idx.embeddings[idx_info.col.col_type._type](indexed_col)
        params = EmbeddingIndexParams(
            metric=idx_info.idx.metric.name.lower(),  # type: ignore[typeddict-item]
            embedding=str(embedding),
            embedding_functions=[str(fn) for fn in idx_info.idx.embeddings.values()],
        )
        index_info[idx_info.name] = IndexMetadata(
            name=idx_info.name, columns=[idx_info.col.name], index_type='embedding', parameters=params
        )

    return TableMetadata(
        name=self._name,
        path=self._path(),
        columns=column_info,
        indices=index_info,
        is_replica=tv.is_replica,
        is_view=False,
        is_snapshot=False,
        version=self._get_version(),
        version_created=datetime.datetime.fromtimestamp(tv.created_at, tz=datetime.timezone.utc),
        schema_version=tvp.schema_version(),
        comment=self._get_comment(),
        media_validation=self._get_media_validation().name.lower(),  # type: ignore[typeddict-item]
        base=None,
    )
|
|
131
169
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
self.
|
|
135
|
-
return self.__tbl_version_path
|
|
170
|
+
def _get_version(self) -> int:
    """Return the version of this table. Used by tests to ascertain version changes."""
    version_path = self._tbl_version_path
    return version_path.version()
|
|
136
173
|
|
|
137
|
-
def
|
|
138
|
-
|
|
174
|
+
def _get_pxt_uri(self) -> str | None:
    """Return the 'pxt_uri' entry of this table's additional metadata, or None if there is no such entry.

    The lookup runs inside a catalog transaction opened for this table's id.
    """
    with catalog.Catalog.get().begin_xact(tbl_id=self._id):
        additional_md = catalog.Catalog.get().get_additional_md(self._id)
        return additional_md.get('pxt_uri')
|
|
139
177
|
|
|
140
|
-
def
|
|
141
|
-
|
|
142
|
-
raise excs.Error(f'{self._display_name()} {self._name} has been dropped')
|
|
178
|
+
def __hash__(self) -> int:
    """Hash this handle on the table id held by its version path."""
    tbl_id = self._tbl_version_path.tbl_id
    return hash(tbl_id)
|
|
143
180
|
|
|
144
181
|
def __getattr__(self, name: str) -> 'exprs.ColumnRef':
    """Return a ColumnRef for the given name.

    Python calls this hook only after regular attribute lookup has failed.

    Raises:
        AttributeError: if the table has no column called `name`, or if `_tbl_version_path`
            itself has not been set on this instance yet.
    """
    # If _tbl_version_path is the attribute that is missing (instance created via __new__, e.g.
    # while copying or unpickling), reading it below would re-enter __getattr__ without end.
    if name == '_tbl_version_path':
        raise AttributeError(name)
    col = self._tbl_version_path.get_column(name)
    if col is None:
        raise AttributeError(f'Unknown column: {name}')
    return ColumnRef(col, reference_tbl=self._tbl_version_path)
|
|
150
187
|
|
|
151
188
|
def __getitem__(self, name: str) -> 'exprs.ColumnRef':
|
|
@@ -163,137 +200,160 @@ class Table(SchemaObject):
|
|
|
163
200
|
Returns:
|
|
164
201
|
A list of view paths.
|
|
165
202
|
"""
|
|
166
|
-
|
|
167
|
-
with env.Env.get().begin_xact():
|
|
168
|
-
return [t._path for t in self._get_views(recursive=recursive)]
|
|
203
|
+
from pixeltable.catalog import retry_loop
|
|
169
204
|
|
|
170
|
-
|
|
205
|
+
# we need retry_loop() here, because we end up loading Tables for the views
|
|
206
|
+
@retry_loop(tbl=self._tbl_version_path, for_write=False)
|
|
207
|
+
def op() -> list[str]:
|
|
208
|
+
return [t._path() for t in self._get_views(recursive=recursive)]
|
|
209
|
+
|
|
210
|
+
return op()
|
|
211
|
+
|
|
212
|
+
def _get_views(self, *, recursive: bool = True, mutable_only: bool = False) -> list['Table']:
    """Return the views defined on this table.

    Args:
        recursive: if True, also include the views of those views, at every depth.
        mutable_only: if True, keep only views whose version path reports is_mutable().
            A view that is filtered out is not descended into either.

    Returns:
        The direct views first, followed by the descendants of each direct view in turn.
    """
    cat = catalog.Catalog.get()
    views = [cat.get_table_by_id(view_id) for view_id in cat.get_view_ids(self._id)]
    if mutable_only:
        views = [t for t in views if t._tbl_version_path.is_mutable()]
    if recursive:
        # Materialize the descendants before extending: feeding extend() a generator that iterates
        # `views` would also visit the entries being appended, and since each recursive call already
        # returns the complete subtree, deeper descendants would be added more than once.
        descendants = [
            t for view in views for t in view._get_views(recursive=True, mutable_only=mutable_only)
        ]
        views.extend(descendants)
    return views
|
|
177
221
|
|
|
178
|
-
def
|
|
179
|
-
"""Return a DataFrame for this table."""
|
|
180
|
-
# local import: avoid circular imports
|
|
181
|
-
from pixeltable.plan import FromClause
|
|
182
|
-
|
|
183
|
-
return pxt.DataFrame(FromClause(tbls=[self._tbl_version_path]))
|
|
184
|
-
|
|
185
|
-
def select(self, *items: Any, **named_items: Any) -> 'pxt.DataFrame':
|
|
222
|
+
def select(self, *items: Any, **named_items: Any) -> 'pxt.Query':
    """Select columns or expressions from this table.

    See [`Query.select`][pixeltable.Query.select] for more details.
    """
    from pixeltable.catalog import Catalog
    from pixeltable.plan import FromClause

    query = pxt.Query(FromClause(tbls=[self._tbl_version_path]))
    if items or named_items:
        # an explicit select list is handed to the query inside a read-only transaction
        with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
            return query.select(*items, **named_items)
    # Select(*); no further processing is necessary
    return query
|
|
236
|
+
|
|
237
|
+
def where(self, pred: 'exprs.Expr') -> 'pxt.Query':
    """Filter rows from this table based on the expression.

    See [`Query.where`][pixeltable.Query.where] for more details.
    """
    from pixeltable.catalog import Catalog

    # build the filtered query inside a read-only transaction on this table
    with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
        query = self.select()
        return query.where(pred)
|
|
198
246
|
|
|
199
247
|
def join(
    self, other: 'Table', *, on: exprs.Expr | None = None, how: 'pixeltable.plan.JoinType.LiteralType' = 'inner'
) -> 'pxt.Query':
    """Join this table with another table.

    Args:
        other: the table to join with.
        on: optional join expression, passed through to the query's join().
        how: the join type, passed through to the query's join(); defaults to 'inner'.

    Returns:
        The query produced by joining a select-all query on this table with `other`.
    """
    # NOTE: `on` is annotated without quotes; a quoted name inside an `X | None` union cannot be
    # evaluated at runtime (str | None is a TypeError), and annotations in this module are lazy anyway.
    from pixeltable.catalog import Catalog

    with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
        return self.select().join(other, on=on, how=how)
|
|
255
|
+
|
|
256
|
+
def order_by(self, *items: 'exprs.Expr', asc: bool = True) -> 'pxt.Query':
    """Order the rows of this table based on the expression.

    See [`Query.order_by`][pixeltable.Query.order_by] for more details.
    """
    from pixeltable.catalog import Catalog

    # build the ordered query inside a read-only transaction on this table
    with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
        query = self.select()
        return query.order_by(*items, asc=asc)
|
|
215
265
|
|
|
216
|
-
def group_by(self, *items: 'exprs.Expr') -> 'pxt.
|
|
266
|
+
def group_by(self, *items: 'exprs.Expr') -> 'pxt.Query':
    """Group the rows of this table based on the expression.

    See [`Query.group_by`][pixeltable.Query.group_by] for more details.
    """
    from pixeltable.catalog import Catalog

    # build the grouped query inside a read-only transaction on this table
    with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
        query = self.select()
        return query.group_by(*items)
|
|
275
|
+
|
|
276
|
+
def distinct(self) -> 'pxt.Query':
    """Remove duplicate rows from table."""
    query = self.select()
    return query.distinct()
|
|
279
|
+
|
|
280
|
+
def limit(self, n: int) -> 'pxt.Query':
    """Return a select-all query on this table with `limit(n)` applied."""
    query = self.select()
    return query.limit(n)
|
|
226
282
|
|
|
227
|
-
def
|
|
228
|
-
|
|
283
|
+
def sample(
    self,
    n: int | None = None,
    n_per_stratum: int | None = None,
    fraction: float | None = None,
    seed: int | None = None,
    stratify_by: Any = None,
) -> pxt.Query:
    """Choose a shuffled sample of rows

    See [`Query.sample`][pixeltable.Query.sample] for more details.
    """
    # every argument is forwarded by keyword, unchanged
    sampling_args = {
        'n': n,
        'n_per_stratum': n_per_stratum,
        'fraction': fraction,
        'seed': seed,
        'stratify_by': stratify_by,
    }
    return self.select().sample(**sampling_args)
|
|
229
298
|
|
|
230
|
-
def collect(self) -> 'pxt.
|
|
299
|
+
def collect(self) -> 'pxt._query.ResultSet':
    """Return rows from this table."""
    query = self.select()
    return query.collect()
|
|
233
302
|
|
|
234
|
-
def show(self, *args: Any, **kwargs: Any) -> 'pxt.
|
|
303
|
+
def show(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
    """Return rows from this table."""
    # all arguments are passed straight through to the query's show()
    query = self.select()
    return query.show(*args, **kwargs)
|
|
237
306
|
|
|
238
|
-
def head(self, *args: Any, **kwargs: Any) -> 'pxt.
|
|
307
|
+
def head(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
    """Return the first n rows inserted into this table."""
    # all arguments are passed straight through to the query's head()
    query = self.select()
    return query.head(*args, **kwargs)
|
|
241
310
|
|
|
242
|
-
def tail(self, *args: Any, **kwargs: Any) -> 'pxt.
|
|
311
|
+
def tail(self, *args: Any, **kwargs: Any) -> 'pxt._query.ResultSet':
    """Return the last n rows inserted into this table."""
    # all arguments are passed straight through to the query's tail()
    query = self.select()
    return query.tail(*args, **kwargs)
|
|
245
314
|
|
|
246
315
|
def count(self) -> int:
    """Return the number of rows in this table."""
    query = self.select()
    return query.count()
|
|
249
318
|
|
|
250
|
-
@property
|
|
251
319
|
def columns(self) -> list[str]:
    """Return the names of the columns in this table."""
    # names come back in the order the version path lists its columns
    return [col.name for col in self._tbl_version_path.columns()]
|
|
255
323
|
|
|
256
|
-
|
|
257
|
-
def _schema(self) -> dict[str, ts.ColumnType]:
|
|
324
|
+
def _get_schema(self) -> dict[str, ts.ColumnType]:
    """Return the schema (column names and column types) of this table."""
    schema: dict[str, ts.ColumnType] = {}
    for col in self._tbl_version_path.columns():
        schema[col.name] = col.col_type
    return schema
|
|
260
327
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
with env.Env.get().begin_xact():
|
|
264
|
-
return self._base_table
|
|
328
|
+
def get_base_table(self) -> Table | None:
    """Return whatever `_get_base_table()` reports as this table's base.

    Returns:
        The base's Table instance, or None.
    """
    # The return annotation is unquoted on purpose: a quoted name inside an `X | None` union cannot
    # be evaluated at runtime (str | None is a TypeError), and annotations in this module are lazy.
    # NOTE(review): _get_base_table() is documented as requiring a transaction context and none is
    # opened here; confirm that callers or the concrete implementations take care of that.
    return self._get_base_table()
|
|
265
330
|
|
|
266
|
-
@property
|
|
267
331
|
@abc.abstractmethod
def _get_base_table(self) -> Table | None:
    """The base's Table instance. Requires a transaction context"""
    # The return annotation is unquoted: a quoted name inside an `X | None` union cannot be
    # evaluated at runtime, and annotations in this module are lazy.
|
|
270
334
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
base = self._base_table
|
|
335
|
+
def _get_base_tables(self) -> list['Table']:
    """The ancestor list of bases of this table, starting with its immediate base. Requires a transaction context"""
    ancestors: list[Table] = []
    node: Table = self
    # climb one base at a time until a table reports no base
    while (node := node._get_base_table()) is not None:
        ancestors.append(node)
    return ancestors
|
|
280
343
|
|
|
281
344
|
@property
@abc.abstractmethod
def _effective_base_versions(self) -> list[int | None]:
    """Effective versions of this table's ancestor bases, beginning with the immediate base."""
|
|
285
348
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
return self._tbl_version.get().comment
|
|
349
|
+
def _get_comment(self) -> str:
    """Return the comment reported by this table's version path."""
    version_path = self._tbl_version_path
    return version_path.comment()
|
|
289
351
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
return self._tbl_version.get().num_retained_versions
|
|
352
|
+
def _get_num_retained_versions(self) -> int:
    """Return the number of retained versions reported by this table's version path."""
    version_path = self._tbl_version_path
    return version_path.num_retained_versions()
|
|
293
354
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
return self._tbl_version.get().media_validation
|
|
355
|
+
def _get_media_validation(self) -> MediaValidation:
    """Return the media validation setting reported by this table's version path."""
    version_path = self._tbl_version_path
    return version_path.media_validation()
|
|
297
357
|
|
|
298
358
|
def __repr__(self) -> str:
    """Return the string rendering of this table's descriptors."""
    descriptors = self._descriptors()
    return descriptors.to_string()
|
|
@@ -305,20 +365,23 @@ class Table(SchemaObject):
|
|
|
305
365
|
"""
|
|
306
366
|
Constructs a list of descriptors for this table that can be pretty-printed.
|
|
307
367
|
"""
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
helper.append(
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
368
|
+
from pixeltable.catalog import Catalog
|
|
369
|
+
|
|
370
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=False):
|
|
371
|
+
helper = DescriptionHelper()
|
|
372
|
+
helper.append(self._table_descriptor())
|
|
373
|
+
helper.append(self._col_descriptor())
|
|
374
|
+
idxs = self._index_descriptor()
|
|
375
|
+
if not idxs.empty:
|
|
376
|
+
helper.append(idxs)
|
|
377
|
+
stores = self._external_store_descriptor()
|
|
378
|
+
if not stores.empty:
|
|
379
|
+
helper.append(stores)
|
|
380
|
+
if self._get_comment():
|
|
381
|
+
helper.append(f'COMMENT: {self._get_comment()}')
|
|
382
|
+
return helper
|
|
383
|
+
|
|
384
|
+
def _col_descriptor(self, columns: list[str] | None = None) -> pd.DataFrame:
|
|
322
385
|
return pd.DataFrame(
|
|
323
386
|
{
|
|
324
387
|
'Column Name': col.name,
|
|
@@ -329,29 +392,28 @@ class Table(SchemaObject):
|
|
|
329
392
|
if columns is None or col.name in columns
|
|
330
393
|
)
|
|
331
394
|
|
|
332
|
-
def _index_descriptor(self, columns:
|
|
395
|
+
def _index_descriptor(self, columns: list[str] | None = None) -> pd.DataFrame:
    """Build a DataFrame with one row per embedding index of this table.

    Args:
        columns: if given, only indices on columns with these names are listed.

    Returns:
        A DataFrame with 'Index Name', 'Column', 'Metric' and 'Embedding' entries per row;
        empty when `_tbl_version` is None or nothing matches.
    """
    from pixeltable import index

    if self._tbl_version is None:
        return pd.DataFrame([])

    pd_rows = []
    for name, info in self._tbl_version.get().idxs_by_name.items():
        if not isinstance(info.idx, index.EmbeddingIndex):
            continue
        if columns is not None and info.col.name not in columns:
            continue
        # render the embedding that applies to the indexed column's type
        col_ref = ColumnRef(info.col)
        embedding = info.idx.embeddings[info.col.col_type._type](col_ref)
        pd_rows.append(
            {
                'Index Name': name,
                'Column': info.col.name,
                'Metric': str(info.idx.metric.name.lower()),
                'Embedding': str(embedding),
            }
        )
    return pd.DataFrame(pd_rows)
|
|
351
413
|
|
|
352
414
|
def _external_store_descriptor(self) -> pd.DataFrame:
    """Build a DataFrame listing each external store's name and the class name of the store object."""
    stores = self._tbl_version_path.tbl_version.get().external_stores
    return pd.DataFrame(
        [{'External Store': name, 'Type': type(store).__name__} for name, store in stores.items()]
    )
|
|
@@ -360,7 +422,6 @@ class Table(SchemaObject):
|
|
|
360
422
|
"""
|
|
361
423
|
Print the table schema.
|
|
362
424
|
"""
|
|
363
|
-
self._check_is_dropped()
|
|
364
425
|
if getattr(builtins, '__IPYTHON__', False):
|
|
365
426
|
from IPython.display import Markdown, display
|
|
366
427
|
|
|
@@ -368,31 +429,28 @@ class Table(SchemaObject):
|
|
|
368
429
|
else:
|
|
369
430
|
print(repr(self))
|
|
370
431
|
|
|
371
|
-
def _drop(self) -> None:
|
|
372
|
-
self._check_is_dropped()
|
|
373
|
-
self._tbl_version.get().drop()
|
|
374
|
-
self._is_dropped = True
|
|
375
|
-
|
|
376
432
|
# TODO Factor this out into a separate module.
|
|
377
433
|
# The return type is unresolvable, but torch can't be imported since it's an optional dependency.
|
|
378
434
|
def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
|
|
379
435
|
"""Return a PyTorch Dataset for this table.
|
|
380
|
-
See
|
|
436
|
+
See Query.to_pytorch_dataset()
|
|
381
437
|
"""
|
|
382
|
-
return self.
|
|
438
|
+
return self.select().to_pytorch_dataset(image_format=image_format)
|
|
383
439
|
|
|
384
440
|
def to_coco_dataset(self) -> Path:
|
|
385
441
|
"""Return the path to a COCO json file for this table.
|
|
386
|
-
See
|
|
442
|
+
See Query.to_coco_dataset()
|
|
387
443
|
"""
|
|
388
|
-
return self.
|
|
444
|
+
return self.select().to_coco_dataset()
|
|
389
445
|
|
|
390
446
|
def _column_has_dependents(self, col: Column) -> bool:
|
|
391
447
|
"""Returns True if the column has dependents, False otherwise."""
|
|
392
448
|
assert col is not None
|
|
393
|
-
assert col.name in self.
|
|
394
|
-
|
|
449
|
+
assert col.name in self._get_schema()
|
|
450
|
+
cat = catalog.Catalog.get()
|
|
451
|
+
if any(c.name is not None for c in cat.get_column_dependents(col.get_tbl().id, col.id)):
|
|
395
452
|
return True
|
|
453
|
+
assert self._tbl_version is not None
|
|
396
454
|
return any(
|
|
397
455
|
col in store.get_local_columns()
|
|
398
456
|
for view in (self, *self._get_views(recursive=True))
|
|
@@ -404,13 +462,13 @@ class Table(SchemaObject):
|
|
|
404
462
|
|
|
405
463
|
If `if_exists='ignore'`, returns a list of existing columns, if any, in `new_col_names`.
|
|
406
464
|
"""
|
|
407
|
-
assert not
|
|
408
|
-
existing_col_names = set(self.
|
|
465
|
+
assert self._tbl_version is not None
|
|
466
|
+
existing_col_names = set(self._get_schema().keys())
|
|
409
467
|
cols_to_ignore = []
|
|
410
468
|
for new_col_name in new_col_names:
|
|
411
469
|
if new_col_name in existing_col_names:
|
|
412
470
|
if if_exists == IfExistsParam.ERROR:
|
|
413
|
-
raise excs.Error(f'Duplicate column name: {new_col_name
|
|
471
|
+
raise excs.Error(f'Duplicate column name: {new_col_name}')
|
|
414
472
|
elif if_exists == IfExistsParam.IGNORE:
|
|
415
473
|
cols_to_ignore.append(new_col_name)
|
|
416
474
|
elif if_exists in (IfExistsParam.REPLACE, IfExistsParam.REPLACE_FORCE):
|
|
@@ -433,15 +491,14 @@ class Table(SchemaObject):
|
|
|
433
491
|
|
|
434
492
|
def add_columns(
|
|
435
493
|
self,
|
|
436
|
-
schema: dict[str,
|
|
494
|
+
schema: dict[str, ts.ColumnType | builtins.type | _GenericAlias],
|
|
437
495
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
438
496
|
) -> UpdateStatus:
|
|
439
497
|
"""
|
|
440
498
|
Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed
|
|
441
499
|
columns, use [`add_computed_column()`][pixeltable.catalog.Table.add_computed_column] instead.
|
|
442
500
|
|
|
443
|
-
The format of the `schema` argument is
|
|
444
|
-
[`create_table()`][pixeltable.globals.create_table].
|
|
501
|
+
The format of the `schema` argument is a dict mapping column names to their types.
|
|
445
502
|
|
|
446
503
|
Args:
|
|
447
504
|
schema: A dictionary mapping column names to types.
|
|
@@ -473,15 +530,16 @@ class Table(SchemaObject):
|
|
|
473
530
|
... }
|
|
474
531
|
... tbl.add_columns(schema)
|
|
475
532
|
"""
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
533
|
+
from pixeltable.catalog import Catalog
|
|
534
|
+
|
|
535
|
+
# lock_mutable_tree=True: we might end up having to drop existing columns, which requires locking the tree
|
|
536
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
537
|
+
self.__check_mutable('add columns to')
|
|
538
|
+
col_schema = {
|
|
539
|
+
col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
|
|
540
|
+
for col_name, spec in schema.items()
|
|
541
|
+
}
|
|
542
|
+
|
|
485
543
|
# handle existing columns based on if_exists parameter
|
|
486
544
|
cols_to_ignore = self._ignore_or_drop_existing_columns(
|
|
487
545
|
list(col_schema.keys()), IfExistsParam.validated(if_exists, 'if_exists')
|
|
@@ -491,20 +549,22 @@ class Table(SchemaObject):
|
|
|
491
549
|
for cname in cols_to_ignore:
|
|
492
550
|
assert cname in col_schema
|
|
493
551
|
del col_schema[cname]
|
|
552
|
+
result = UpdateStatus()
|
|
494
553
|
if len(col_schema) == 0:
|
|
495
|
-
return
|
|
554
|
+
return result
|
|
496
555
|
new_cols = self._create_columns(col_schema)
|
|
497
556
|
for new_col in new_cols:
|
|
498
557
|
self._verify_column(new_col)
|
|
499
|
-
|
|
558
|
+
assert self._tbl_version is not None
|
|
559
|
+
result += self._tbl_version.get().add_columns(new_cols, print_stats=False, on_error='abort')
|
|
500
560
|
FileCache.get().emit_eviction_warnings()
|
|
501
|
-
return
|
|
561
|
+
return result
|
|
502
562
|
|
|
503
563
|
def add_column(
|
|
504
564
|
self,
|
|
505
565
|
*,
|
|
506
566
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
507
|
-
**kwargs:
|
|
567
|
+
**kwargs: ts.ColumnType | builtins.type | _GenericAlias | exprs.Expr,
|
|
508
568
|
) -> UpdateStatus:
|
|
509
569
|
"""
|
|
510
570
|
Adds an ordinary (non-computed) column to the table.
|
|
@@ -515,7 +575,7 @@ class Table(SchemaObject):
|
|
|
515
575
|
|
|
516
576
|
- `'error'`: an exception will be raised.
|
|
517
577
|
- `'ignore'`: do nothing and return.
|
|
518
|
-
- `'replace' or 'replace_force'`: drop the existing column and add the new column, if it has
|
|
578
|
+
- `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has
|
|
519
579
|
no dependents.
|
|
520
580
|
|
|
521
581
|
Returns:
|
|
@@ -534,15 +594,11 @@ class Table(SchemaObject):
|
|
|
534
594
|
|
|
535
595
|
>>> tbl.add_columns({'new_col': pxt.Int})
|
|
536
596
|
"""
|
|
537
|
-
self._check_is_dropped()
|
|
538
|
-
# verify kwargs
|
|
539
|
-
if self._tbl_version.get().is_snapshot:
|
|
540
|
-
raise excs.Error('Cannot add column to a snapshot.')
|
|
541
597
|
# verify kwargs and construct column schema dict
|
|
542
598
|
if len(kwargs) != 1:
|
|
543
599
|
raise excs.Error(
|
|
544
|
-
f'add_column() requires exactly one keyword argument of the form
|
|
545
|
-
f'got {len(kwargs)} instead ({", ".join(kwargs.keys())})'
|
|
600
|
+
f'add_column() requires exactly one keyword argument of the form `col_name=col_type`; '
|
|
601
|
+
f'got {len(kwargs)} arguments instead ({", ".join(kwargs.keys())})'
|
|
546
602
|
)
|
|
547
603
|
col_type = next(iter(kwargs.values()))
|
|
548
604
|
if not isinstance(col_type, (ts.ColumnType, type, _GenericAlias)):
|
|
@@ -554,7 +610,8 @@ class Table(SchemaObject):
|
|
|
554
610
|
def add_computed_column(
|
|
555
611
|
self,
|
|
556
612
|
*,
|
|
557
|
-
stored:
|
|
613
|
+
stored: bool | None = None,
|
|
614
|
+
destination: str | Path | None = None,
|
|
558
615
|
print_stats: bool = False,
|
|
559
616
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
560
617
|
if_exists: Literal['error', 'ignore', 'replace'] = 'error',
|
|
@@ -566,6 +623,7 @@ class Table(SchemaObject):
|
|
|
566
623
|
Args:
|
|
567
624
|
kwargs: Exactly one keyword argument of the form `col_name=expression`.
|
|
568
625
|
stored: Whether the column is materialized and stored or computed on demand.
|
|
626
|
+
destination: An object store reference for persisting computed files.
|
|
569
627
|
print_stats: If `True`, print execution metrics during evaluation.
|
|
570
628
|
on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
|
|
571
629
|
row.
|
|
@@ -573,7 +631,7 @@ class Table(SchemaObject):
|
|
|
573
631
|
- `'abort'`: an exception will be raised and the column will not be added.
|
|
574
632
|
- `'ignore'`: execution will continue and the column will be added. Any rows
|
|
575
633
|
with errors will have a `None` value for the column, with information about the error stored in the
|
|
576
|
-
corresponding `tbl.col_name.
|
|
634
|
+
corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
|
|
577
635
|
if_exists: Determines the behavior if the column already exists. Must be one of the following:
|
|
578
636
|
|
|
579
637
|
- `'error'`: an exception will be raised.
|
|
@@ -598,48 +656,53 @@ class Table(SchemaObject):
|
|
|
598
656
|
|
|
599
657
|
>>> tbl.add_computed_column(rotated=tbl.frame.rotate(90), stored=False)
|
|
600
658
|
"""
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
)
|
|
659
|
+
from pixeltable.catalog import Catalog
|
|
660
|
+
|
|
661
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
662
|
+
self.__check_mutable('add columns to')
|
|
663
|
+
if len(kwargs) != 1:
|
|
664
|
+
raise excs.Error(
|
|
665
|
+
f'add_computed_column() requires exactly one keyword argument of the form '
|
|
666
|
+
'`col_name=col_type` or `col_name=expression`; '
|
|
667
|
+
f'got {len(kwargs)} arguments instead ({", ".join(kwargs.keys())})'
|
|
668
|
+
)
|
|
669
|
+
col_name, spec = next(iter(kwargs.items()))
|
|
670
|
+
if not is_valid_identifier(col_name):
|
|
671
|
+
raise excs.Error(f'Invalid column name: {col_name}')
|
|
672
|
+
|
|
673
|
+
col_schema: dict[str, Any] = {'value': spec}
|
|
674
|
+
if stored is not None:
|
|
675
|
+
col_schema['stored'] = stored
|
|
676
|
+
|
|
677
|
+
if destination is not None:
|
|
678
|
+
col_schema['destination'] = destination
|
|
679
|
+
|
|
680
|
+
# Raise an error if the column expression refers to a column error property
|
|
681
|
+
if isinstance(spec, exprs.Expr):
|
|
682
|
+
for e in spec.subexprs(expr_class=exprs.ColumnPropertyRef, traverse_matches=False):
|
|
683
|
+
if e.is_cellmd_prop():
|
|
684
|
+
raise excs.Error(
|
|
685
|
+
f'Use of a reference to the {e.prop.name.lower()!r} property of another column '
|
|
686
|
+
f'is not allowed in a computed column.'
|
|
687
|
+
)
|
|
626
688
|
|
|
627
|
-
with Env.get().begin_xact():
|
|
628
689
|
# handle existing columns based on if_exists parameter
|
|
629
690
|
cols_to_ignore = self._ignore_or_drop_existing_columns(
|
|
630
691
|
[col_name], IfExistsParam.validated(if_exists, 'if_exists')
|
|
631
692
|
)
|
|
632
693
|
# if the column to add already exists and user asked to ignore
|
|
633
|
-
#
|
|
694
|
+
# existing column, there's nothing to do.
|
|
695
|
+
result = UpdateStatus()
|
|
634
696
|
if len(cols_to_ignore) != 0:
|
|
635
697
|
assert cols_to_ignore[0] == col_name
|
|
636
|
-
return
|
|
698
|
+
return result
|
|
637
699
|
|
|
638
700
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
639
701
|
self._verify_column(new_col)
|
|
640
|
-
|
|
702
|
+
assert self._tbl_version is not None
|
|
703
|
+
result += self._tbl_version.get().add_columns([new_col], print_stats=print_stats, on_error=on_error)
|
|
641
704
|
FileCache.get().emit_eviction_warnings()
|
|
642
|
-
return
|
|
705
|
+
return result
|
|
643
706
|
|
|
644
707
|
@classmethod
|
|
645
708
|
def _validate_column_spec(cls, name: str, spec: dict[str, Any]) -> None:
|
|
@@ -649,40 +712,45 @@ class Table(SchemaObject):
|
|
|
649
712
|
(on account of containing Python Callables or Exprs).
|
|
650
713
|
"""
|
|
651
714
|
assert isinstance(spec, dict)
|
|
652
|
-
valid_keys = {'type', 'value', 'stored', 'media_validation'}
|
|
715
|
+
valid_keys = {'type', 'value', 'stored', 'media_validation', 'destination'}
|
|
653
716
|
for k in spec:
|
|
654
717
|
if k not in valid_keys:
|
|
655
|
-
raise excs.Error(f'Column {name}: invalid key {k!r}')
|
|
718
|
+
raise excs.Error(f'Column {name!r}: invalid key {k!r}')
|
|
656
719
|
|
|
657
720
|
if 'type' not in spec and 'value' not in spec:
|
|
658
|
-
raise excs.Error(f"Column {name}: 'type' or 'value' must be specified")
|
|
721
|
+
raise excs.Error(f"Column {name!r}: 'type' or 'value' must be specified")
|
|
659
722
|
|
|
660
723
|
if 'type' in spec and not isinstance(spec['type'], (ts.ColumnType, type, _GenericAlias)):
|
|
661
|
-
raise excs.Error(f
|
|
724
|
+
raise excs.Error(f"Column {name!r}: 'type' must be a type or ColumnType; got {spec['type']}")
|
|
662
725
|
|
|
663
726
|
if 'value' in spec:
|
|
664
727
|
value_expr = exprs.Expr.from_object(spec['value'])
|
|
665
728
|
if value_expr is None:
|
|
666
|
-
raise excs.Error(f
|
|
729
|
+
raise excs.Error(f"Column {name!r}: 'value' must be a Pixeltable expression.")
|
|
667
730
|
if 'type' in spec:
|
|
668
|
-
raise excs.Error(f"Column {name}: 'type' is redundant if 'value' is specified")
|
|
731
|
+
raise excs.Error(f"Column {name!r}: 'type' is redundant if 'value' is specified")
|
|
669
732
|
|
|
670
733
|
if 'media_validation' in spec:
|
|
671
|
-
_ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name}: media_validation')
|
|
734
|
+
_ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name!r}: media_validation')
|
|
672
735
|
|
|
673
736
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
674
|
-
raise excs.Error(f
|
|
737
|
+
raise excs.Error(f"Column {name!r}: 'stored' must be a bool; got {spec['stored']}")
|
|
738
|
+
|
|
739
|
+
d = spec.get('destination')
|
|
740
|
+
if d is not None and not isinstance(d, (str, Path)):
|
|
741
|
+
raise excs.Error(f'Column {name!r}: `destination` must be a string or path; got {d}')
|
|
675
742
|
|
|
676
743
|
@classmethod
|
|
677
744
|
def _create_columns(cls, schema: dict[str, Any]) -> list[Column]:
|
|
678
745
|
"""Construct list of Columns, given schema"""
|
|
679
746
|
columns: list[Column] = []
|
|
680
747
|
for name, spec in schema.items():
|
|
681
|
-
col_type:
|
|
682
|
-
value_expr:
|
|
748
|
+
col_type: ts.ColumnType | None = None
|
|
749
|
+
value_expr: exprs.Expr | None = None
|
|
683
750
|
primary_key: bool = False
|
|
684
|
-
media_validation:
|
|
751
|
+
media_validation: catalog.MediaValidation | None = None
|
|
685
752
|
stored = True
|
|
753
|
+
destination: str | None = None
|
|
686
754
|
|
|
687
755
|
if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
|
|
688
756
|
col_type = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
|
|
@@ -707,6 +775,7 @@ class Table(SchemaObject):
|
|
|
707
775
|
media_validation = (
|
|
708
776
|
catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None else None
|
|
709
777
|
)
|
|
778
|
+
destination = spec.get('destination')
|
|
710
779
|
else:
|
|
711
780
|
raise excs.Error(f'Invalid value for column {name!r}')
|
|
712
781
|
|
|
@@ -717,41 +786,46 @@ class Table(SchemaObject):
|
|
|
717
786
|
stored=stored,
|
|
718
787
|
is_pk=primary_key,
|
|
719
788
|
media_validation=media_validation,
|
|
789
|
+
destination=destination,
|
|
720
790
|
)
|
|
791
|
+
# Validate the column's resolved_destination. This will ensure that if the column uses a default (global)
|
|
792
|
+
# media destination, it gets validated at this time.
|
|
793
|
+
ObjectOps.validate_destination(column.destination, column.name)
|
|
721
794
|
columns.append(column)
|
|
795
|
+
|
|
722
796
|
return columns
|
|
723
797
|
|
|
724
798
|
@classmethod
|
|
725
799
|
def validate_column_name(cls, name: str) -> None:
|
|
726
|
-
"""Check that a name is usable as a
|
|
800
|
+
"""Check that a name is usable as a pixeltable column name"""
|
|
727
801
|
if is_system_column_name(name) or is_python_keyword(name):
|
|
728
802
|
raise excs.Error(f'{name!r} is a reserved name in Pixeltable; please choose a different column name.')
|
|
729
803
|
if not is_valid_identifier(name):
|
|
730
|
-
raise excs.Error(f'Invalid column name: {name
|
|
804
|
+
raise excs.Error(f'Invalid column name: {name}')
|
|
731
805
|
|
|
732
806
|
@classmethod
|
|
733
807
|
def _verify_column(cls, col: Column) -> None:
|
|
734
808
|
"""Check integrity of user-supplied Column and supply defaults"""
|
|
735
809
|
cls.validate_column_name(col.name)
|
|
736
810
|
if col.stored is False and not col.is_computed:
|
|
737
|
-
raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed columns')
|
|
811
|
+
raise excs.Error(f'Column {col.name!r}: `stored={col.stored}` only applies to computed columns')
|
|
738
812
|
if col.stored is False and col.has_window_fn_call():
|
|
739
813
|
raise excs.Error(
|
|
740
814
|
(
|
|
741
|
-
f'Column {col.name!r}: stored={col.stored} is not valid for image columns computed with a '
|
|
815
|
+
f'Column {col.name!r}: `stored={col.stored}` is not valid for image columns computed with a '
|
|
742
816
|
f'streaming function'
|
|
743
817
|
)
|
|
744
818
|
)
|
|
819
|
+
if col._explicit_destination is not None and not (col.stored and col.is_computed):
|
|
820
|
+
raise excs.Error(f'Column {col.name!r}: `destination` property only applies to stored computed columns')
|
|
745
821
|
|
|
746
822
|
@classmethod
|
|
747
823
|
def _verify_schema(cls, schema: list[Column]) -> None:
|
|
748
824
|
"""Check integrity of user-supplied schema and set defaults"""
|
|
749
|
-
column_names: set[str] = set()
|
|
750
825
|
for col in schema:
|
|
751
826
|
cls._verify_column(col)
|
|
752
|
-
column_names.add(col.name)
|
|
753
827
|
|
|
754
|
-
def drop_column(self, column:
|
|
828
|
+
def drop_column(self, column: str | ColumnRef, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
|
|
755
829
|
"""Drop a column from the table.
|
|
756
830
|
|
|
757
831
|
Args:
|
|
@@ -781,53 +855,88 @@ class Table(SchemaObject):
|
|
|
781
855
|
>>> tbl = pxt.get_table('my_table')
|
|
782
856
|
... tbl.drop_col(tbl.col, if_not_exists='ignore')
|
|
783
857
|
"""
|
|
784
|
-
|
|
785
|
-
if self._tbl_version_path.is_snapshot():
|
|
786
|
-
raise excs.Error('Cannot drop column from a snapshot.')
|
|
787
|
-
col: Column = None
|
|
788
|
-
if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
789
|
-
if isinstance(column, str):
|
|
790
|
-
col = self._tbl_version_path.get_column(column, include_bases=False)
|
|
791
|
-
if col is None:
|
|
792
|
-
if if_not_exists_ == IfNotExistsParam.ERROR:
|
|
793
|
-
raise excs.Error(f'Column {column!r} unknown')
|
|
794
|
-
assert if_not_exists_ == IfNotExistsParam.IGNORE
|
|
795
|
-
return
|
|
796
|
-
col = self._tbl_version.get().cols_by_name[column]
|
|
797
|
-
else:
|
|
798
|
-
exists = self._tbl_version_path.has_column(column.col, include_bases=False)
|
|
799
|
-
if not exists:
|
|
800
|
-
if if_not_exists_ == IfNotExistsParam.ERROR:
|
|
801
|
-
raise excs.Error(f'Unknown column: {column.col.qualified_name}')
|
|
802
|
-
assert if_not_exists_ == IfNotExistsParam.IGNORE
|
|
803
|
-
return
|
|
804
|
-
col = column.col
|
|
858
|
+
from pixeltable.catalog import Catalog
|
|
805
859
|
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
860
|
+
cat = Catalog.get()
|
|
861
|
+
|
|
862
|
+
# lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
|
|
863
|
+
with cat.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
864
|
+
self.__check_mutable('drop columns from')
|
|
865
|
+
col: Column = None
|
|
866
|
+
if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
867
|
+
|
|
868
|
+
if isinstance(column, str):
|
|
869
|
+
col = self._tbl_version_path.get_column(column)
|
|
870
|
+
if col is None:
|
|
871
|
+
if if_not_exists_ == IfNotExistsParam.ERROR:
|
|
872
|
+
raise excs.Error(f'Unknown column: {column}')
|
|
873
|
+
assert if_not_exists_ == IfNotExistsParam.IGNORE
|
|
874
|
+
return
|
|
875
|
+
if col.get_tbl().id != self._tbl_version_path.tbl_id:
|
|
876
|
+
raise excs.Error(f'Cannot drop base table column {col.name!r}')
|
|
877
|
+
col = self._tbl_version.get().cols_by_name[column]
|
|
878
|
+
else:
|
|
879
|
+
exists = self._tbl_version_path.has_column(column.col)
|
|
880
|
+
if not exists:
|
|
881
|
+
if if_not_exists_ == IfNotExistsParam.ERROR:
|
|
882
|
+
raise excs.Error(f'Unknown column: {column.col.qualified_name}')
|
|
883
|
+
assert if_not_exists_ == IfNotExistsParam.IGNORE
|
|
884
|
+
return
|
|
885
|
+
col = column.col
|
|
886
|
+
if col.get_tbl().id != self._tbl_version_path.tbl_id:
|
|
887
|
+
raise excs.Error(f'Cannot drop base table column {col.name!r}')
|
|
888
|
+
|
|
889
|
+
dependent_user_cols = [c for c in cat.get_column_dependents(col.get_tbl().id, col.id) if c.name is not None]
|
|
890
|
+
if len(dependent_user_cols) > 0:
|
|
891
|
+
raise excs.Error(
|
|
892
|
+
f'Cannot drop column {col.name!r} because the following columns depend on it:\n'
|
|
893
|
+
f'{", ".join(c.name for c in dependent_user_cols)}'
|
|
894
|
+
)
|
|
895
|
+
|
|
896
|
+
views = self._get_views(recursive=True, mutable_only=True)
|
|
897
|
+
|
|
898
|
+
# See if any view predicates depend on this column
|
|
899
|
+
dependent_views: list[tuple[Table, exprs.Expr]] = []
|
|
900
|
+
for view in views:
|
|
901
|
+
if view._tbl_version is not None:
|
|
902
|
+
predicate = view._tbl_version.get().predicate
|
|
903
|
+
if predicate is not None:
|
|
904
|
+
for predicate_col in exprs.Expr.get_refd_column_ids(predicate.as_dict()):
|
|
905
|
+
if predicate_col.tbl_id == col.get_tbl().id and predicate_col.col_id == col.id:
|
|
906
|
+
dependent_views.append((view, predicate))
|
|
907
|
+
|
|
908
|
+
if len(dependent_views) > 0:
|
|
909
|
+
dependent_views_str = '\n'.join(
|
|
910
|
+
f'view: {view._path()}, predicate: {predicate}' for view, predicate in dependent_views
|
|
911
|
+
)
|
|
912
|
+
raise excs.Error(
|
|
913
|
+
f'Cannot drop column {col.name!r} because the following views depend on it:\n{dependent_views_str}'
|
|
914
|
+
)
|
|
812
915
|
|
|
813
|
-
with Env.get().begin_xact():
|
|
814
916
|
# See if this column has a dependent store. We need to look through all stores in all
|
|
815
917
|
# (transitive) views of this table.
|
|
918
|
+
col_handle = col.handle
|
|
816
919
|
dependent_stores = [
|
|
817
920
|
(view, store)
|
|
818
|
-
for view in (self, *
|
|
921
|
+
for view in (self, *views)
|
|
819
922
|
for store in view._tbl_version.get().external_stores.values()
|
|
820
|
-
if
|
|
923
|
+
if col_handle in store.get_local_columns()
|
|
821
924
|
]
|
|
822
925
|
if len(dependent_stores) > 0:
|
|
823
926
|
dependent_store_names = [
|
|
824
|
-
store.name if view._id == self._id else f'{store.name} (in view
|
|
927
|
+
store.name if view._id == self._id else f'{store.name} (in view {view._name!r})'
|
|
825
928
|
for view, store in dependent_stores
|
|
826
929
|
]
|
|
827
930
|
raise excs.Error(
|
|
828
|
-
f'Cannot drop column
|
|
931
|
+
f'Cannot drop column {col.name!r} because the following external stores depend on it:\n'
|
|
829
932
|
f'{", ".join(dependent_store_names)}'
|
|
830
933
|
)
|
|
934
|
+
all_columns = self.columns()
|
|
935
|
+
if len(all_columns) == 1 and col.name == all_columns[0]:
|
|
936
|
+
raise excs.Error(
|
|
937
|
+
f'Cannot drop column {col.name!r} because it is the last remaining column in this table.'
|
|
938
|
+
f' Tables must have at least one column.'
|
|
939
|
+
)
|
|
831
940
|
|
|
832
941
|
self._tbl_version.get().drop_column(col)
|
|
833
942
|
|
|
@@ -847,7 +956,9 @@ class Table(SchemaObject):
|
|
|
847
956
|
>>> tbl = pxt.get_table('my_table')
|
|
848
957
|
... tbl.rename_column('col1', 'col2')
|
|
849
958
|
"""
|
|
850
|
-
|
|
959
|
+
from pixeltable.catalog import Catalog
|
|
960
|
+
|
|
961
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
|
|
851
962
|
self._tbl_version.get().rename_column(old_name, new_name)
|
|
852
963
|
|
|
853
964
|
def _list_index_info_for_test(self) -> list[dict[str, Any]]:
|
|
@@ -858,7 +969,6 @@ class Table(SchemaObject):
|
|
|
858
969
|
A list of index information, each containing the index's
|
|
859
970
|
id, name, and the name of the column it indexes.
|
|
860
971
|
"""
|
|
861
|
-
assert not self._is_dropped
|
|
862
972
|
index_info = []
|
|
863
973
|
for idx_name, idx in self._tbl_version.get().idxs_by_name.items():
|
|
864
974
|
index_info.append({'_id': idx.id, '_name': idx_name, '_column': idx.col.name})
|
|
@@ -866,13 +976,13 @@ class Table(SchemaObject):
|
|
|
866
976
|
|
|
867
977
|
def add_embedding_index(
|
|
868
978
|
self,
|
|
869
|
-
column:
|
|
979
|
+
column: str | ColumnRef,
|
|
870
980
|
*,
|
|
871
|
-
idx_name:
|
|
872
|
-
embedding:
|
|
873
|
-
string_embed:
|
|
874
|
-
image_embed:
|
|
875
|
-
metric:
|
|
981
|
+
idx_name: str | None = None,
|
|
982
|
+
embedding: pxt.Function | None = None,
|
|
983
|
+
string_embed: pxt.Function | None = None,
|
|
984
|
+
image_embed: pxt.Function | None = None,
|
|
985
|
+
metric: Literal['cosine', 'ip', 'l2'] = 'cosine',
|
|
876
986
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
877
987
|
) -> None:
|
|
878
988
|
"""
|
|
@@ -880,25 +990,33 @@ class Table(SchemaObject):
|
|
|
880
990
|
rows are inserted into the table.
|
|
881
991
|
|
|
882
992
|
To add an embedding index, one must specify, at minimum, the column to be indexed and an embedding UDF.
|
|
883
|
-
Only `String` and `Image` columns are currently supported.
|
|
884
|
-
|
|
993
|
+
Only `String` and `Image` columns are currently supported.
|
|
994
|
+
|
|
995
|
+
Examples:
|
|
996
|
+
Here's an example that uses a
|
|
997
|
+
[CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
|
|
998
|
+
|
|
999
|
+
>>> from pixeltable.functions.huggingface import clip
|
|
1000
|
+
>>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
1001
|
+
>>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
1002
|
+
|
|
1003
|
+
Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:
|
|
885
1004
|
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
1005
|
+
>>> reference_img = PIL.Image.open('my_image.jpg')
|
|
1006
|
+
>>> sim = tbl.img.similarity(image=reference_img)
|
|
1007
|
+
>>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
889
1008
|
|
|
890
|
-
|
|
1009
|
+
If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
|
|
1010
|
+
performed using any of its supported modalities. In our example, CLIP supports both text and images, so we
|
|
1011
|
+
can also search for images using a text description:
|
|
891
1012
|
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
1013
|
+
>>> sim = tbl.img.similarity(string='a picture of a train')
|
|
1014
|
+
>>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
895
1015
|
|
|
896
|
-
|
|
897
|
-
performed using any of its supported types. In our example, CLIP supports both text and images, so we can
|
|
898
|
-
also search for images using a text description:
|
|
1016
|
+
Audio and video lookups would look like this:
|
|
899
1017
|
|
|
900
|
-
|
|
901
|
-
|
|
1018
|
+
>>> sim = tbl.img.similarity(audio='/path/to/audio.flac')
|
|
1019
|
+
>>> sim = tbl.img.similarity(video='/path/to/video.mp4')
|
|
902
1020
|
|
|
903
1021
|
Args:
|
|
904
1022
|
column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
|
|
@@ -929,9 +1047,9 @@ class Table(SchemaObject):
|
|
|
929
1047
|
Add an index to the `img` column of the table `my_table`:
|
|
930
1048
|
|
|
931
1049
|
>>> from pixeltable.functions.huggingface import clip
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
1050
|
+
>>> tbl = pxt.get_table('my_table')
|
|
1051
|
+
>>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
1052
|
+
>>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
935
1053
|
|
|
936
1054
|
Alternatively, the `img` column may be specified by name:
|
|
937
1055
|
|
|
@@ -955,11 +1073,12 @@ class Table(SchemaObject):
|
|
|
955
1073
|
... image_embed=image_embedding_fn
|
|
956
1074
|
... )
|
|
957
1075
|
"""
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
1076
|
+
from pixeltable.catalog import Catalog
|
|
1077
|
+
|
|
1078
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1079
|
+
self.__check_mutable('add an index to')
|
|
1080
|
+
col = self._resolve_column_parameter(column)
|
|
961
1081
|
|
|
962
|
-
with Env.get().begin_xact():
|
|
963
1082
|
if idx_name is not None and idx_name in self._tbl_version.get().idxs_by_name:
|
|
964
1083
|
if_exists_ = IfExistsParam.validated(if_exists, 'if_exists')
|
|
965
1084
|
# An index with the same name already exists.
|
|
@@ -968,7 +1087,7 @@ class Table(SchemaObject):
|
|
|
968
1087
|
raise excs.Error(f'Duplicate index name: {idx_name}')
|
|
969
1088
|
if not isinstance(self._tbl_version.get().idxs_by_name[idx_name].idx, index.EmbeddingIndex):
|
|
970
1089
|
raise excs.Error(
|
|
971
|
-
f'Index
|
|
1090
|
+
f'Index {idx_name!r} is not an embedding index. Cannot {if_exists_.name.lower()} it.'
|
|
972
1091
|
)
|
|
973
1092
|
if if_exists_ == IfExistsParam.IGNORE:
|
|
974
1093
|
return
|
|
@@ -981,10 +1100,9 @@ class Table(SchemaObject):
|
|
|
981
1100
|
if idx_name is not None:
|
|
982
1101
|
Table.validate_column_name(idx_name)
|
|
983
1102
|
|
|
984
|
-
#
|
|
985
|
-
idx = EmbeddingIndex(
|
|
986
|
-
|
|
987
|
-
)
|
|
1103
|
+
# validate EmbeddingIndex args
|
|
1104
|
+
idx = EmbeddingIndex(metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed)
|
|
1105
|
+
_ = idx.create_value_expr(col)
|
|
988
1106
|
_ = self._tbl_version.get().add_index(col, idx_name=idx_name, idx=idx)
|
|
989
1107
|
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
990
1108
|
FileCache.get().emit_eviction_warnings()
|
|
@@ -992,8 +1110,8 @@ class Table(SchemaObject):
|
|
|
992
1110
|
def drop_embedding_index(
|
|
993
1111
|
self,
|
|
994
1112
|
*,
|
|
995
|
-
column:
|
|
996
|
-
idx_name:
|
|
1113
|
+
column: str | ColumnRef | None = None,
|
|
1114
|
+
idx_name: str | None = None,
|
|
997
1115
|
if_not_exists: Literal['error', 'ignore'] = 'error',
|
|
998
1116
|
) -> None:
|
|
999
1117
|
"""
|
|
@@ -1039,26 +1157,28 @@ class Table(SchemaObject):
|
|
|
1039
1157
|
>>> tbl = pxt.get_table('my_table')
|
|
1040
1158
|
... tbl.drop_embedding_index(idx_name='idx1', if_not_exists='ignore')
|
|
1041
1159
|
"""
|
|
1160
|
+
from pixeltable.catalog import Catalog
|
|
1161
|
+
|
|
1042
1162
|
if (column is None) == (idx_name is None):
|
|
1043
1163
|
raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
|
|
1044
1164
|
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1165
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1166
|
+
col: Column = None
|
|
1167
|
+
if idx_name is None:
|
|
1168
|
+
col = self._resolve_column_parameter(column)
|
|
1169
|
+
assert col is not None
|
|
1049
1170
|
|
|
1050
|
-
with Env.get().begin_xact():
|
|
1051
1171
|
self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex, if_not_exists=if_not_exists)
|
|
1052
1172
|
|
|
1053
|
-
def _resolve_column_parameter(self, column:
|
|
1173
|
+
def _resolve_column_parameter(self, column: str | ColumnRef) -> Column:
|
|
1054
1174
|
"""Resolve a column parameter to a Column object"""
|
|
1055
1175
|
col: Column = None
|
|
1056
1176
|
if isinstance(column, str):
|
|
1057
|
-
col = self._tbl_version_path.get_column(column
|
|
1177
|
+
col = self._tbl_version_path.get_column(column)
|
|
1058
1178
|
if col is None:
|
|
1059
|
-
raise excs.Error(f'
|
|
1179
|
+
raise excs.Error(f'Unknown column: {column}')
|
|
1060
1180
|
elif isinstance(column, ColumnRef):
|
|
1061
|
-
exists = self._tbl_version_path.has_column(column.col
|
|
1181
|
+
exists = self._tbl_version_path.has_column(column.col)
|
|
1062
1182
|
if not exists:
|
|
1063
1183
|
raise excs.Error(f'Unknown column: {column.col.qualified_name}')
|
|
1064
1184
|
col = column.col
|
|
@@ -1069,8 +1189,8 @@ class Table(SchemaObject):
|
|
|
1069
1189
|
def drop_index(
|
|
1070
1190
|
self,
|
|
1071
1191
|
*,
|
|
1072
|
-
column:
|
|
1073
|
-
idx_name:
|
|
1192
|
+
column: str | ColumnRef | None = None,
|
|
1193
|
+
idx_name: str | None = None,
|
|
1074
1194
|
if_not_exists: Literal['error', 'ignore'] = 'error',
|
|
1075
1195
|
) -> None:
|
|
1076
1196
|
"""
|
|
@@ -1116,27 +1236,30 @@ class Table(SchemaObject):
|
|
|
1116
1236
|
... tbl.drop_index(idx_name='idx1', if_not_exists='ignore')
|
|
1117
1237
|
|
|
1118
1238
|
"""
|
|
1239
|
+
from pixeltable.catalog import Catalog
|
|
1240
|
+
|
|
1119
1241
|
if (column is None) == (idx_name is None):
|
|
1120
1242
|
raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
|
|
1121
1243
|
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1244
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
|
|
1245
|
+
col: Column = None
|
|
1246
|
+
if idx_name is None:
|
|
1247
|
+
col = self._resolve_column_parameter(column)
|
|
1248
|
+
assert col is not None
|
|
1126
1249
|
|
|
1127
|
-
with Env.get().begin_xact():
|
|
1128
1250
|
self._drop_index(col=col, idx_name=idx_name, if_not_exists=if_not_exists)
|
|
1129
1251
|
|
|
1130
1252
|
def _drop_index(
|
|
1131
1253
|
self,
|
|
1132
1254
|
*,
|
|
1133
|
-
col:
|
|
1134
|
-
idx_name:
|
|
1135
|
-
_idx_class:
|
|
1255
|
+
col: Column | None = None,
|
|
1256
|
+
idx_name: str | None = None,
|
|
1257
|
+
_idx_class: type[index.IndexBase] | None = None,
|
|
1136
1258
|
if_not_exists: Literal['error', 'ignore'] = 'error',
|
|
1137
1259
|
) -> None:
|
|
1138
|
-
|
|
1139
|
-
|
|
1260
|
+
from pixeltable.catalog import Catalog
|
|
1261
|
+
|
|
1262
|
+
self.__check_mutable('drop an index from')
|
|
1140
1263
|
assert (col is None) != (idx_name is None)
|
|
1141
1264
|
|
|
1142
1265
|
if idx_name is not None:
|
|
@@ -1148,9 +1271,10 @@ class Table(SchemaObject):
|
|
|
1148
1271
|
return
|
|
1149
1272
|
idx_info = self._tbl_version.get().idxs_by_name[idx_name]
|
|
1150
1273
|
else:
|
|
1151
|
-
if col.
|
|
1274
|
+
if col.get_tbl().id != self._tbl_version.id:
|
|
1152
1275
|
raise excs.Error(
|
|
1153
|
-
f'Column {col.name!r}:
|
|
1276
|
+
f'Column {col.name!r}: '
|
|
1277
|
+
f'cannot drop index from column that belongs to base table {col.get_tbl().name!r}'
|
|
1154
1278
|
)
|
|
1155
1279
|
idx_info_list = [info for info in self._tbl_version.get().idxs_by_name.values() if info.col.id == col.id]
|
|
1156
1280
|
if _idx_class is not None:
|
|
@@ -1162,14 +1286,17 @@ class Table(SchemaObject):
|
|
|
1162
1286
|
assert if_not_exists_ == IfNotExistsParam.IGNORE
|
|
1163
1287
|
return
|
|
1164
1288
|
if len(idx_info_list) > 1:
|
|
1165
|
-
raise excs.Error(f
|
|
1289
|
+
raise excs.Error(f'Column {col.name!r} has multiple indices; specify `idx_name` explicitly to drop one')
|
|
1166
1290
|
idx_info = idx_info_list[0]
|
|
1167
1291
|
|
|
1168
1292
|
# Find out if anything depends on this index
|
|
1169
|
-
|
|
1293
|
+
val_col = idx_info.val_col
|
|
1294
|
+
dependent_user_cols = [
|
|
1295
|
+
c for c in Catalog.get().get_column_dependents(val_col.get_tbl().id, val_col.id) if c.name is not None
|
|
1296
|
+
]
|
|
1170
1297
|
if len(dependent_user_cols) > 0:
|
|
1171
1298
|
raise excs.Error(
|
|
1172
|
-
f'Cannot drop index because the following columns depend on it:\n'
|
|
1299
|
+
f'Cannot drop index {idx_info.name!r} because the following columns depend on it:\n'
|
|
1173
1300
|
f'{", ".join(c.name for c in dependent_user_cols)}'
|
|
1174
1301
|
)
|
|
1175
1302
|
self._tbl_version.get().drop_index(idx_info.id)
|
|
@@ -1180,8 +1307,8 @@ class Table(SchemaObject):
|
|
|
1180
1307
|
source: TableDataSource,
|
|
1181
1308
|
/,
|
|
1182
1309
|
*,
|
|
1183
|
-
source_format:
|
|
1184
|
-
schema_overrides:
|
|
1310
|
+
source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
|
|
1311
|
+
schema_overrides: dict[str, ts.ColumnType] | None = None,
|
|
1185
1312
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1186
1313
|
print_stats: bool = False,
|
|
1187
1314
|
**kwargs: Any,
|
|
@@ -1195,11 +1322,11 @@ class Table(SchemaObject):
|
|
|
1195
1322
|
@abc.abstractmethod
|
|
1196
1323
|
def insert(
|
|
1197
1324
|
self,
|
|
1198
|
-
source:
|
|
1325
|
+
source: TableDataSource | None = None,
|
|
1199
1326
|
/,
|
|
1200
1327
|
*,
|
|
1201
|
-
source_format:
|
|
1202
|
-
schema_overrides:
|
|
1328
|
+
source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
|
|
1329
|
+
schema_overrides: dict[str, ts.ColumnType] | None = None,
|
|
1203
1330
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1204
1331
|
print_stats: bool = False,
|
|
1205
1332
|
**kwargs: Any,
|
|
@@ -1216,7 +1343,8 @@ class Table(SchemaObject):
|
|
|
1216
1343
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1217
1344
|
print_stats: bool = False,
|
|
1218
1345
|
**kwargs: Any,
|
|
1219
|
-
)
|
|
1346
|
+
)
|
|
1347
|
+
```
|
|
1220
1348
|
|
|
1221
1349
|
To insert just a single row, you can use the more concise syntax:
|
|
1222
1350
|
|
|
@@ -1226,7 +1354,8 @@ class Table(SchemaObject):
|
|
|
1226
1354
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1227
1355
|
print_stats: bool = False,
|
|
1228
1356
|
**kwargs: Any
|
|
1229
|
-
)
|
|
1357
|
+
)
|
|
1358
|
+
```
|
|
1230
1359
|
|
|
1231
1360
|
Args:
|
|
1232
1361
|
source: A data source from which data can be imported.
|
|
@@ -1269,11 +1398,20 @@ class Table(SchemaObject):
|
|
|
1269
1398
|
Insert rows from a CSV file:
|
|
1270
1399
|
|
|
1271
1400
|
>>> tbl.insert(source='path/to/file.csv')
|
|
1401
|
+
|
|
1402
|
+
Insert Pydantic model instances into a table with two `pxt.Int` columns `a` and `b`:
|
|
1403
|
+
|
|
1404
|
+
>>> class MyModel(pydantic.BaseModel):
|
|
1405
|
+
... a: int
|
|
1406
|
+
... b: int
|
|
1407
|
+
...
|
|
1408
|
+
... models = [MyModel(a=1, b=2), MyModel(a=3, b=4)]
|
|
1409
|
+
... tbl.insert(models)
|
|
1272
1410
|
"""
|
|
1273
1411
|
raise NotImplementedError
|
|
1274
1412
|
|
|
1275
1413
|
def update(
|
|
1276
|
-
self, value_spec: dict[str, Any], where:
|
|
1414
|
+
self, value_spec: dict[str, Any], where: 'exprs.Expr' | None = None, cascade: bool = True
|
|
1277
1415
|
) -> UpdateStatus:
|
|
1278
1416
|
"""Update rows in this table.
|
|
1279
1417
|
|
|
@@ -1282,6 +1420,9 @@ class Table(SchemaObject):
|
|
|
1282
1420
|
where: a predicate to filter rows to update.
|
|
1283
1421
|
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
1284
1422
|
|
|
1423
|
+
Returns:
|
|
1424
|
+
An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
|
|
1425
|
+
|
|
1285
1426
|
Examples:
|
|
1286
1427
|
Set column `int_col` to 1 for all rows:
|
|
1287
1428
|
|
|
@@ -1299,10 +1440,13 @@ class Table(SchemaObject):
|
|
|
1299
1440
|
|
|
1300
1441
|
>>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
|
|
1301
1442
|
"""
|
|
1302
|
-
|
|
1303
|
-
|
|
1443
|
+
from pixeltable.catalog import Catalog
|
|
1444
|
+
|
|
1445
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1446
|
+
self.__check_mutable('update')
|
|
1447
|
+
result = self._tbl_version.get().update(value_spec, where, cascade)
|
|
1304
1448
|
FileCache.get().emit_eviction_warnings()
|
|
1305
|
-
return
|
|
1449
|
+
return result
|
|
1306
1450
|
|
|
1307
1451
|
def batch_update(
|
|
1308
1452
|
self,
|
|
@@ -1326,45 +1470,51 @@ class Table(SchemaObject):
|
|
|
1326
1470
|
Update the `name` and `age` columns for the rows with ids 1 and 2 (assuming `id` is the primary key).
|
|
1327
1471
|
If either row does not exist, this raises an error:
|
|
1328
1472
|
|
|
1329
|
-
>>> tbl.
|
|
1473
|
+
>>> tbl.batch_update(
|
|
1474
|
+
... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}]
|
|
1475
|
+
... )
|
|
1330
1476
|
|
|
1331
1477
|
Update the `name` and `age` columns for the row with `id` 1 (assuming `id` is the primary key) and insert
|
|
1332
1478
|
the row with new `id` 3 (assuming this key does not exist):
|
|
1333
1479
|
|
|
1334
|
-
>>> tbl.
|
|
1335
|
-
|
|
1336
|
-
|
|
1480
|
+
>>> tbl.batch_update(
|
|
1481
|
+
... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
|
|
1482
|
+
... if_not_exists='insert'
|
|
1483
|
+
... )
|
|
1337
1484
|
"""
|
|
1338
|
-
|
|
1339
|
-
raise excs.Error('Cannot update a snapshot')
|
|
1340
|
-
rows = list(rows)
|
|
1485
|
+
from pixeltable.catalog import Catalog
|
|
1341
1486
|
|
|
1342
|
-
|
|
1343
|
-
|
|
1487
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1488
|
+
self.__check_mutable('update')
|
|
1489
|
+
rows = list(rows)
|
|
1344
1490
|
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
rowids: list[tuple[int, ...]] = []
|
|
1348
|
-
if len(pk_col_names) == 0 and not has_rowid:
|
|
1349
|
-
raise excs.Error('Table must have primary key for batch update')
|
|
1491
|
+
row_updates: list[dict[Column, exprs.Expr]] = []
|
|
1492
|
+
pk_col_names = {c.name for c in self._tbl_version.get().primary_key_columns()}
|
|
1350
1493
|
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
)
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
if
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1494
|
+
# pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
|
|
1495
|
+
has_rowid = _ROWID_COLUMN_NAME in rows[0]
|
|
1496
|
+
rowids: list[tuple[int, ...]] = []
|
|
1497
|
+
if len(pk_col_names) == 0 and not has_rowid:
|
|
1498
|
+
raise excs.Error('Table must have primary key for batch update')
|
|
1499
|
+
|
|
1500
|
+
for row_spec in rows:
|
|
1501
|
+
col_vals = self._tbl_version.get()._validate_update_spec(
|
|
1502
|
+
row_spec, allow_pk=not has_rowid, allow_exprs=False, allow_media=False
|
|
1503
|
+
)
|
|
1504
|
+
if has_rowid:
|
|
1505
|
+
# we expect the _rowid column to be present for each row
|
|
1506
|
+
assert _ROWID_COLUMN_NAME in row_spec
|
|
1507
|
+
rowids.append(row_spec[_ROWID_COLUMN_NAME])
|
|
1508
|
+
else:
|
|
1509
|
+
col_names = {col.name for col in col_vals}
|
|
1510
|
+
if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
|
|
1511
|
+
missing_cols = pk_col_names - {col.name for col in col_vals}
|
|
1512
|
+
raise excs.Error(
|
|
1513
|
+
f'Primary key column(s) {", ".join(repr(c) for c in missing_cols)} missing in {row_spec}'
|
|
1514
|
+
)
|
|
1515
|
+
row_updates.append(col_vals)
|
|
1516
|
+
|
|
1517
|
+
result = self._tbl_version.get().batch_update(
|
|
1368
1518
|
row_updates,
|
|
1369
1519
|
rowids,
|
|
1370
1520
|
error_if_not_exists=if_not_exists == 'error',
|
|
@@ -1372,9 +1522,85 @@ class Table(SchemaObject):
|
|
|
1372
1522
|
cascade=cascade,
|
|
1373
1523
|
)
|
|
1374
1524
|
FileCache.get().emit_eviction_warnings()
|
|
1375
|
-
return
|
|
1525
|
+
return result
|
|
1526
|
+
|
|
1527
|
+
def recompute_columns(
    self,
    *columns: str | ColumnRef,
    where: 'exprs.Expr' | None = None,
    errors_only: bool = False,
    cascade: bool = True,
) -> UpdateStatus:
    """Recompute the values of one or more computed columns of this table.

    Args:
        columns: The names or references of the computed columns to recompute.
        where: A predicate to filter rows to recompute.
        errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
            `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
        cascade: if True, also update all computed columns that transitively depend on the recomputed columns.

    Returns:
        An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the recomputation.

    Raises:
        Error: If no column is given, if `errors_only=True` is combined with more than one column, if a column
            is unknown, is not a computed column or belongs to a base table, or if `where` is not bound by this
            table.

    Examples:
        Recompute computed columns `c1` and `c2` for all rows in this table, and everything that transitively
        depends on them:

        >>> tbl.recompute_columns('c1', 'c2')

        Recompute computed columns `c1` and `c2` for all rows in this table, but don't recompute other columns
        that depend on them:

        >>> tbl.recompute_columns(tbl.c1, tbl.c2, cascade=False)

        Recompute column `c1` and its dependents, but only for rows with `c2` == 0:

        >>> tbl.recompute_columns('c1', where=tbl.c2 == 0)

        Recompute column `c1` and its dependents, but only for rows that have errors in it:

        >>> tbl.recompute_columns('c1', errors_only=True)
    """
    from pixeltable.catalog import Catalog

    catalog_ = Catalog.get()
    # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
    with catalog_.begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
        self.__check_mutable('recompute columns of')

        # argument validation
        if not columns:
            raise excs.Error('At least one column must be specified to recompute')
        if errors_only and len(columns) > 1:
            raise excs.Error('Cannot use errors_only=True with multiple columns')

        # resolve every argument to a column of this table and collect its name
        names: list[str] = []
        for column in columns:
            if isinstance(column, str):
                name = column
                target = self._tbl_version_path.get_column(column)
                if target is None:
                    raise excs.Error(f'Unknown column: {column}')
            else:
                assert isinstance(column, ColumnRef)
                target = column.col
                if not self._tbl_version_path.has_column(target):
                    raise excs.Error(f'Unknown column: {target.name}')
                name = target.name

            if not target.is_computed:
                raise excs.Error(f'Column {name!r} is not a computed column')
            # only columns owned by this table (not by one of its bases) can be recomputed here
            if target.get_tbl().id != self._tbl_version_path.tbl_id:
                raise excs.Error(f'Cannot recompute column of a base: {name}')
            names.append(name)

        if where is not None:
            if not where.is_bound_by([self._tbl_version_path]):
                raise excs.Error(f'`where` predicate ({where}) is not bound by {self._display_str()}')

        status = self._tbl_version.get().recompute_columns(
            names, where=where, errors_only=errors_only, cascade=cascade
        )
        FileCache.get().emit_eviction_warnings()
        return status
|
|
1376
1602
|
|
|
1377
|
-
def delete(self, where:
|
|
1603
|
+
def delete(self, where: 'exprs.Expr' | None = None) -> UpdateStatus:
|
|
1378
1604
|
"""Delete rows in this table.
|
|
1379
1605
|
|
|
1380
1606
|
Args:
|
|
@@ -1397,12 +1623,75 @@ class Table(SchemaObject):
|
|
|
1397
1623
|
.. warning::
|
|
1398
1624
|
This operation is irreversible.
|
|
1399
1625
|
"""
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
with Env.get().begin_xact():
|
|
1626
|
+
with catalog.Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
1627
|
+
self.__check_mutable('revert')
|
|
1403
1628
|
self._tbl_version.get().revert()
|
|
1629
|
+
# remove cached md in order to force a reload on the next operation
|
|
1630
|
+
self._tbl_version_path.clear_cached_md()
|
|
1631
|
+
|
|
1632
|
+
def push(self) -> None:
    """Push this table to its published counterpart via `push_replica()`.

    Returns without pushing (after logging a message) when `self._tbl_version` is None, since named snapshots
    never have new versions to push.

    Raises:
        Error: If this table is a replica, has no pxt URI (has not been published yet), or is a
            specific-version table handle.
    """
    from pixeltable.share import push_replica
    from pixeltable.share.protocol import PxtUri

    published_uri = self._get_pxt_uri()
    tv = self._tbl_version_path.tbl_version.get()

    if tv.is_replica:
        msg = f'push(): Cannot push replica table {self._name!r}. (Did you mean `pull()`?)'
        raise excs.Error(msg)

    if pxt_uri_missing := (published_uri is None):
        msg = (
            f'push(): Table {self._name!r} has not yet been published to Pixeltable Cloud. '
            'To publish it, use `pxt.publish()` instead.'
        )
        raise excs.Error(msg)

    if isinstance(self, catalog.View) and self._is_anonymous_snapshot():
        msg = (
            f'push(): Cannot push specific-version table handle {tv.versioned_name!r}. '
            'To push the latest version instead:\n'
            f' t = pxt.get_table({self._name!r})\n'
            f' t.push()'
        )
        raise excs.Error(msg)

    if self._tbl_version is None:
        # Named snapshots never have new versions to push.
        env.Env.get().console_logger.info('push(): Everything up to date.')
        return

    # Take org/db from the stored pxt URI and address the table by its UUID instead
    source_uri = PxtUri(uri=published_uri)
    target_uri = PxtUri.from_components(org=source_uri.org, id=self._id, db=source_uri.db)
    push_replica(str(target_uri), self)
|
|
1667
|
+
|
|
1668
|
+
def pull(self) -> None:
    """Pull the latest version of this replica table via `pull_replica()`.

    Raises:
        Error: If this table is not a replica, has no pxt URI, or is a specific-version table handle.
    """
    from pixeltable.share import pull_replica
    from pixeltable.share.protocol import PxtUri

    published_uri = self._get_pxt_uri()
    tv = self._tbl_version_path.tbl_version.get()

    if not tv.is_replica or published_uri is None:
        msg = f'pull(): Table {self._name!r} is not a replica of a Pixeltable Cloud table (nothing to `pull()`).'
        raise excs.Error(msg)

    if isinstance(self, catalog.View) and self._is_anonymous_snapshot():
        msg = (
            f'pull(): Cannot pull specific-version table handle {tv.versioned_name!r}. '
            'To pull the latest version instead:\n'
            f' t = pxt.get_table({self._name!r})\n'
            f' t.pull()'
        )
        raise excs.Error(msg)

    # Take org/db from the stored pxt URI and address the table by its UUID instead
    source_uri = PxtUri(uri=published_uri)
    target_uri = PxtUri.from_components(org=source_uri.org, id=self._id, db=source_uri.db)
    pull_replica(self._path(), str(target_uri))
|
|
1404
1694
|
|
|
1405
|
-
@property
|
|
1406
1695
|
def external_stores(self) -> list[str]:
    """Return the names (keys) of the external stores registered on this table's version."""
    tbl_version = self._tbl_version.get()
    store_names = tbl_version.external_stores.keys()
    return list(store_names)
|
|
1408
1697
|
|
|
@@ -1410,21 +1699,20 @@ class Table(SchemaObject):
|
|
|
1410
1699
|
"""
|
|
1411
1700
|
Links the specified `ExternalStore` to this table.
|
|
1412
1701
|
"""
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1702
|
+
from pixeltable.catalog import Catalog
|
|
1703
|
+
|
|
1704
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
|
|
1705
|
+
self.__check_mutable('link an external store to')
|
|
1706
|
+
if store.name in self.external_stores():
|
|
1707
|
+
raise excs.Error(f'Table {self._name!r} already has an external store with that name: {store.name}')
|
|
1708
|
+
_logger.info(f'Linking external store {store.name!r} to table {self._name!r}.')
|
|
1709
|
+
|
|
1710
|
+
store.link(self._tbl_version.get()) # might call tbl_version.add_columns()
|
|
1419
1711
|
self._tbl_version.get().link_external_store(store)
|
|
1420
|
-
env.Env.get().console_logger.info(f'Linked external store
|
|
1712
|
+
env.Env.get().console_logger.info(f'Linked external store {store.name!r} to table {self._name!r}.')
|
|
1421
1713
|
|
|
1422
1714
|
def unlink_external_stores(
|
|
1423
|
-
self,
|
|
1424
|
-
stores: Optional[str | list[str]] = None,
|
|
1425
|
-
*,
|
|
1426
|
-
delete_external_data: bool = False,
|
|
1427
|
-
ignore_errors: bool = False,
|
|
1715
|
+
self, stores: str | list[str] | None = None, *, delete_external_data: bool = False, ignore_errors: bool = False
|
|
1428
1716
|
) -> None:
|
|
1429
1717
|
"""
|
|
1430
1718
|
Unlinks this table's external stores.
|
|
@@ -1437,28 +1725,37 @@ class Table(SchemaObject):
|
|
|
1437
1725
|
delete_external_data (bool): If `True`, then the external data store will also be deleted. WARNING: This
|
|
1438
1726
|
is a destructive operation that will delete data outside Pixeltable, and cannot be undone.
|
|
1439
1727
|
"""
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1728
|
+
from pixeltable.catalog import Catalog
|
|
1729
|
+
|
|
1730
|
+
if not self._tbl_version_path.is_mutable():
|
|
1731
|
+
return
|
|
1732
|
+
with Catalog.get().begin_xact(tbl=self._tbl_version_path, for_write=True, lock_mutable_tree=False):
|
|
1733
|
+
all_stores = self.external_stores()
|
|
1734
|
+
|
|
1735
|
+
if stores is None:
|
|
1736
|
+
stores = all_stores
|
|
1737
|
+
elif isinstance(stores, str):
|
|
1738
|
+
stores = [stores]
|
|
1739
|
+
|
|
1740
|
+
# Validation
|
|
1741
|
+
if not ignore_errors:
|
|
1742
|
+
for store_name in stores:
|
|
1743
|
+
if store_name not in all_stores:
|
|
1744
|
+
raise excs.Error(f'Table {self._name!r} has no external store with that name: {store_name}')
|
|
1745
|
+
|
|
1746
|
+
for store_name in stores:
|
|
1747
|
+
store = self._tbl_version.get().external_stores[store_name]
|
|
1748
|
+
# get hold of the store's debug string before deleting it
|
|
1749
|
+
store_str = str(store)
|
|
1750
|
+
store.unlink(self._tbl_version.get()) # might call tbl_version.drop_columns()
|
|
1751
|
+
self._tbl_version.get().unlink_external_store(store)
|
|
1752
|
+
if delete_external_data and isinstance(store, pxt.io.external_store.Project):
|
|
1753
|
+
store.delete()
|
|
1754
|
+
env.Env.get().console_logger.info(f'Unlinked external store from table {self._name!r}: {store_str}')
|
|
1458
1755
|
|
|
1459
1756
|
def sync(
|
|
1460
|
-
self, stores:
|
|
1461
|
-
) ->
|
|
1757
|
+
self, stores: str | list[str] | None = None, *, export_data: bool = True, import_data: bool = True
|
|
1758
|
+
) -> UpdateStatus:
|
|
1462
1759
|
"""
|
|
1463
1760
|
Synchronizes this table with its linked external stores.
|
|
1464
1761
|
|
|
@@ -1468,29 +1765,139 @@ class Table(SchemaObject):
|
|
|
1468
1765
|
export_data: If `True`, data from this table will be exported to the external stores during synchronization.
|
|
1469
1766
|
import_data: If `True`, data from the external stores will be imported to this table during synchronization.
|
|
1470
1767
|
"""
|
|
1471
|
-
|
|
1472
|
-
|
|
1768
|
+
from pixeltable.catalog import Catalog
|
|
1769
|
+
|
|
1770
|
+
if not self._tbl_version_path.is_mutable():
|
|
1771
|
+
return UpdateStatus()
|
|
1772
|
+
# we lock the entire tree starting at the root base table in order to ensure that all synced columns can
|
|
1773
|
+
# have their updates propagated down the tree
|
|
1774
|
+
base_tv = self._tbl_version_path.get_tbl_versions()[-1]
|
|
1775
|
+
with Catalog.get().begin_xact(tbl=TableVersionPath(base_tv), for_write=True, lock_mutable_tree=True):
|
|
1776
|
+
all_stores = self.external_stores()
|
|
1473
1777
|
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1778
|
+
if stores is None:
|
|
1779
|
+
stores = all_stores
|
|
1780
|
+
elif isinstance(stores, str):
|
|
1781
|
+
stores = [stores]
|
|
1478
1782
|
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1783
|
+
for store in stores:
|
|
1784
|
+
if store not in all_stores:
|
|
1785
|
+
raise excs.Error(f'Table {self._name!r} has no external store with that name: {store}')
|
|
1482
1786
|
|
|
1483
|
-
|
|
1484
|
-
with Env.get().begin_xact():
|
|
1787
|
+
sync_status = UpdateStatus()
|
|
1485
1788
|
for store in stores:
|
|
1486
1789
|
store_obj = self._tbl_version.get().external_stores[store]
|
|
1487
1790
|
store_sync_status = store_obj.sync(self, export_data=export_data, import_data=import_data)
|
|
1488
|
-
sync_status
|
|
1791
|
+
sync_status += store_sync_status
|
|
1489
1792
|
|
|
1490
1793
|
return sync_status
|
|
1491
1794
|
|
|
1492
1795
|
def __dir__(self) -> list[str]:
    """Return the default attribute names followed by the keys of this table's schema."""
    names = list(super().__dir__())
    names.extend(self._get_schema().keys())
    return names
|
|
1494
1797
|
|
|
1495
1798
|
def _ipython_key_completions_(self) -> list[str]:
    """Return the keys of this table's schema as key-completion candidates."""
    schema = self._get_schema()
    return list(schema.keys())
|
|
1800
|
+
|
|
1801
|
+
def get_versions(self, n: int | None = None) -> list[VersionMetadata]:
    """
    Returns information about versions of this table, most recent first.

    `get_versions()` is intended for programmatic access to version metadata; for human-readable
    output, use [`history()`][pixeltable.Table.history] instead.

    Args:
        n: if specified, will return at most `n` versions

    Returns:
        A list of [VersionMetadata][pixeltable.VersionMetadata] dictionaries, one per version retrieved, most
        recent first.

    Raises:
        Error: If `n` is not an integer >= 1.

    Examples:
        Retrieve metadata about all versions of the table `tbl`:

        >>> tbl.get_versions()

        Retrieve metadata about the most recent 5 versions of the table `tbl`:

        >>> tbl.get_versions(n=5)
    """
    from pixeltable.catalog import Catalog

    # no limit requested: fall back to a very large cap
    limit = 1_000_000_000 if n is None else n
    if not isinstance(limit, int) or limit < 1:
        raise excs.Error(f'Invalid value for `n`: {limit}')

    # Fetch one version more than requested, if available, so that the schema change of the oldest
    # reported version can be computed
    table_id = self._id
    fetched = Catalog.get().collect_tbl_history(table_id, limit + 1)

    # Map version -> description of its schema change
    schema_changes = MetadataUtils._create_md_change_dict(
        [(entry.version_md.version, entry.schema_version_md.columns) for entry in fetched]
    )

    # Drop the extra lookahead version from the report
    if len(fetched) > limit:
        assert len(fetched) == limit + 1
        reported = fetched[:-1]
    else:
        reported = fetched

    report: list[VersionMetadata] = []
    for entry in reported:
        version = entry.version_md.version
        schema_change = schema_changes.get(version)

        status = entry.version_md.update_status
        if status is None:
            status = UpdateStatus()

        change_type: Literal['schema', 'data']
        if schema_change is not None:
            change_type = 'schema'
        else:
            change_type = 'data'

        # combined counts of the direct and the cascaded changes
        counts = status.row_count_stats + status.cascade_row_count_stats
        created_at = datetime.datetime.fromtimestamp(entry.version_md.created_at, tz=datetime.timezone.utc)
        version_md = VersionMetadata(
            version=version,
            created_at=created_at,
            user=entry.version_md.user,
            change_type=change_type,
            inserts=counts.ins_rows,
            updates=counts.upd_rows,
            deletes=counts.del_rows,
            errors=counts.num_excs,
            computed=counts.computed_values,
            schema_change=schema_change,
        )
        report.append(version_md)

    return report
|
|
1872
|
+
|
|
1873
|
+
def history(self, n: int | None = None) -> pd.DataFrame:
|
|
1874
|
+
"""
|
|
1875
|
+
Returns a human-readable report about versions of this table.
|
|
1876
|
+
|
|
1877
|
+
`history()` is intended for human-readable output of version metadata; for programmatic access,
|
|
1878
|
+
use [`get_versions()`][pixeltable.Table.get_versions] instead.
|
|
1879
|
+
|
|
1880
|
+
Args:
|
|
1881
|
+
n: if specified, will return at most `n` versions
|
|
1882
|
+
|
|
1883
|
+
Returns:
|
|
1884
|
+
A report with information about each version, one per row, most recent first.
|
|
1885
|
+
|
|
1886
|
+
Examples:
|
|
1887
|
+
Report all versions of the table:
|
|
1888
|
+
|
|
1889
|
+
>>> tbl.history()
|
|
1890
|
+
|
|
1891
|
+
Report only the most recent 5 changes to the table:
|
|
1892
|
+
|
|
1893
|
+
>>> tbl.history(n=5)
|
|
1894
|
+
"""
|
|
1895
|
+
versions = self.get_versions(n)
|
|
1896
|
+
assert len(versions) > 0
|
|
1897
|
+
return pd.DataFrame([list(v.values()) for v in versions], columns=list(versions[0].keys()))
|
|
1898
|
+
|
|
1899
|
+
def __check_mutable(self, op_descr: str) -> None:
|
|
1900
|
+
if self._tbl_version_path.is_replica():
|
|
1901
|
+
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a replica.')
|
|
1902
|
+
if self._tbl_version_path.is_snapshot():
|
|
1903
|
+
raise excs.Error(f'{self._display_str()}: Cannot {op_descr} a snapshot.')
|