pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +509 -103
- pixeltable/catalog/column.py +5 -0
- pixeltable/catalog/dir.py +15 -6
- pixeltable/catalog/globals.py +16 -0
- pixeltable/catalog/insertable_table.py +82 -41
- pixeltable/catalog/path.py +15 -0
- pixeltable/catalog/schema_object.py +7 -12
- pixeltable/catalog/table.py +81 -67
- pixeltable/catalog/table_version.py +23 -7
- pixeltable/catalog/view.py +9 -6
- pixeltable/env.py +15 -9
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exprs/__init__.py +2 -1
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_ref.py +38 -2
- pixeltable/exprs/expr.py +61 -12
- pixeltable/exprs/function_call.py +1 -4
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +4 -4
- pixeltable/exprs/json_path.py +10 -11
- pixeltable/exprs/similarity_expr.py +5 -20
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/functions/yolox.py +21 -64
- pixeltable/func/callable_function.py +5 -2
- pixeltable/func/query_template_function.py +6 -18
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/globals.py +16 -5
- pixeltable/globals.py +172 -262
- pixeltable/io/__init__.py +3 -2
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/pandas.py +29 -43
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/coroutine.py +41 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/column.py
CHANGED
|
@@ -202,6 +202,11 @@ class Column:
|
|
|
202
202
|
assert self.tbl is not None
|
|
203
203
|
return self.tbl.get().media_validation
|
|
204
204
|
|
|
205
|
+
@property
|
|
206
|
+
def is_required_for_insert(self) -> bool:
|
|
207
|
+
"""Returns True if column is required when inserting rows."""
|
|
208
|
+
return not self.col_type.nullable and not self.is_computed
|
|
209
|
+
|
|
205
210
|
def source(self) -> None:
|
|
206
211
|
"""
|
|
207
212
|
If this is a computed col and the top-level expr is a function call, print the source, if possible.
|
pixeltable/catalog/dir.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import dataclasses
|
|
4
|
+
import datetime
|
|
5
|
+
import json
|
|
4
6
|
import logging
|
|
5
7
|
from uuid import UUID
|
|
6
8
|
|
|
7
9
|
import sqlalchemy as sql
|
|
10
|
+
from sqlalchemy.dialects.postgresql import JSONB
|
|
8
11
|
|
|
9
12
|
from pixeltable.env import Env
|
|
10
13
|
from pixeltable.metadata import schema
|
|
@@ -26,6 +29,7 @@ class Dir(SchemaObject):
|
|
|
26
29
|
dir_record = schema.Dir(parent_id=parent_id, md=dataclasses.asdict(dir_md))
|
|
27
30
|
session.add(dir_record)
|
|
28
31
|
session.flush()
|
|
32
|
+
# print(f'{datetime.datetime.now()} create dir {dir_record}')
|
|
29
33
|
assert dir_record.id is not None
|
|
30
34
|
assert isinstance(dir_record.id, UUID)
|
|
31
35
|
dir = cls(dir_record.id, parent_id, name)
|
|
@@ -43,11 +47,16 @@ class Dir(SchemaObject):
|
|
|
43
47
|
return super()._path()
|
|
44
48
|
|
|
45
49
|
def _move(self, new_name: str, new_dir_id: UUID) -> None:
|
|
50
|
+
# print(
|
|
51
|
+
# f'{datetime.datetime.now()} move dir name={self._name} parent={self._dir_id} new_name={new_name} new_dir_id={new_dir_id}'
|
|
52
|
+
# )
|
|
46
53
|
super()._move(new_name, new_dir_id)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
54
|
+
stmt = sql.text(
|
|
55
|
+
(
|
|
56
|
+
f'UPDATE {schema.Dir.__table__} '
|
|
57
|
+
f'SET {schema.Dir.parent_id.name} = :new_dir_id, '
|
|
58
|
+
f" {schema.Dir.md.name}['name'] = :new_name "
|
|
59
|
+
f'WHERE {schema.Dir.id.name} = :id'
|
|
53
60
|
)
|
|
61
|
+
)
|
|
62
|
+
Env.get().conn.execute(stmt, {'new_dir_id': new_dir_id, 'new_name': json.dumps(new_name), 'id': self._id})
|
pixeltable/catalog/globals.py
CHANGED
|
@@ -40,6 +40,22 @@ class UpdateStatus:
|
|
|
40
40
|
self.cols_with_excs = list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs))
|
|
41
41
|
return self
|
|
42
42
|
|
|
43
|
+
@property
|
|
44
|
+
def insert_msg(self) -> str:
|
|
45
|
+
"""Return a message describing the results of an insert operation."""
|
|
46
|
+
if self.num_excs == 0:
|
|
47
|
+
cols_with_excs_str = ''
|
|
48
|
+
else:
|
|
49
|
+
cols_with_excs_str = (
|
|
50
|
+
f' across {len(self.cols_with_excs)} column{"" if len(self.cols_with_excs) == 1 else "s"}'
|
|
51
|
+
)
|
|
52
|
+
cols_with_excs_str += f' ({", ".join(self.cols_with_excs)})'
|
|
53
|
+
msg = (
|
|
54
|
+
f'Inserted {self.num_rows} row{"" if self.num_rows == 1 else "s"} '
|
|
55
|
+
f'with {self.num_excs} error{"" if self.num_excs == 1 else "s"}{cols_with_excs_str}.'
|
|
56
|
+
)
|
|
57
|
+
return msg
|
|
58
|
+
|
|
43
59
|
|
|
44
60
|
class MediaValidation(enum.Enum):
|
|
45
61
|
ON_READ = 0
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import enum
|
|
3
4
|
import logging
|
|
4
|
-
from typing import Any, Iterable, Literal, Optional, overload
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, overload
|
|
5
6
|
from uuid import UUID
|
|
6
7
|
|
|
7
8
|
import pixeltable as pxt
|
|
@@ -16,9 +17,36 @@ from .table_version import TableVersion
|
|
|
16
17
|
from .table_version_handle import TableVersionHandle
|
|
17
18
|
from .table_version_path import TableVersionPath
|
|
18
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import datasets # type: ignore[import-untyped]
|
|
22
|
+
|
|
23
|
+
from pixeltable.globals import RowData, TableDataSource
|
|
24
|
+
from pixeltable.io.table_data_conduit import TableDataConduit
|
|
25
|
+
|
|
19
26
|
_logger = logging.getLogger('pixeltable')
|
|
20
27
|
|
|
21
28
|
|
|
29
|
+
class OnErrorParameter(enum.Enum):
|
|
30
|
+
"""Supported values for the on_error parameter"""
|
|
31
|
+
|
|
32
|
+
ABORT = 'abort'
|
|
33
|
+
IGNORE = 'ignore'
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def is_valid(cls, v: Any) -> bool:
|
|
37
|
+
if isinstance(v, str):
|
|
38
|
+
return v.lower() in [c.value for c in cls]
|
|
39
|
+
return False
|
|
40
|
+
|
|
41
|
+
@classmethod
|
|
42
|
+
def fail_on_exception(cls, v: Any) -> bool:
|
|
43
|
+
if not cls.is_valid(v):
|
|
44
|
+
raise ValueError(f'Invalid value for on_error: {v}')
|
|
45
|
+
if isinstance(v, str):
|
|
46
|
+
return v.lower() != cls.IGNORE.value
|
|
47
|
+
return True
|
|
48
|
+
|
|
49
|
+
|
|
22
50
|
class InsertableTable(Table):
|
|
23
51
|
"""A `Table` that allows inserting and deleting rows."""
|
|
24
52
|
|
|
@@ -86,62 +114,75 @@ class InsertableTable(Table):
|
|
|
86
114
|
@overload
|
|
87
115
|
def insert(
|
|
88
116
|
self,
|
|
89
|
-
|
|
117
|
+
source: Optional[TableDataSource] = None,
|
|
90
118
|
/,
|
|
91
119
|
*,
|
|
92
|
-
|
|
120
|
+
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
121
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
93
122
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
123
|
+
print_stats: bool = False,
|
|
124
|
+
**kwargs: Any,
|
|
94
125
|
) -> UpdateStatus: ...
|
|
95
126
|
|
|
96
127
|
@overload
|
|
97
128
|
def insert(
|
|
98
|
-
self, *,
|
|
129
|
+
self, /, *, on_error: Literal['abort', 'ignore'] = 'abort', print_stats: bool = False, **kwargs: Any
|
|
99
130
|
) -> UpdateStatus: ...
|
|
100
131
|
|
|
101
|
-
def insert(
|
|
132
|
+
def insert(
|
|
102
133
|
self,
|
|
103
|
-
|
|
134
|
+
source: Optional[TableDataSource] = None,
|
|
104
135
|
/,
|
|
105
136
|
*,
|
|
106
|
-
|
|
137
|
+
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
138
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
107
139
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
140
|
+
print_stats: bool = False,
|
|
108
141
|
**kwargs: Any,
|
|
109
142
|
) -> UpdateStatus:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
if not isinstance(rows, list):
|
|
120
|
-
raise excs.Error('rows must be a list of dictionaries')
|
|
121
|
-
if len(rows) == 0:
|
|
122
|
-
raise excs.Error('rows must not be empty')
|
|
123
|
-
for row in rows:
|
|
124
|
-
if not isinstance(row, dict):
|
|
125
|
-
raise excs.Error('rows must be a list of dictionaries')
|
|
126
|
-
self._validate_input_rows(rows)
|
|
127
|
-
with Env.get().begin_xact():
|
|
128
|
-
status = self._tbl_version.get().insert(
|
|
129
|
-
rows, None, print_stats=print_stats, fail_on_exception=fail_on_exception
|
|
130
|
-
)
|
|
131
|
-
|
|
132
|
-
if status.num_excs == 0:
|
|
133
|
-
cols_with_excs_str = ''
|
|
134
|
-
else:
|
|
135
|
-
cols_with_excs_str = (
|
|
136
|
-
f' across {len(status.cols_with_excs)} column{"" if len(status.cols_with_excs) == 1 else "s"}'
|
|
137
|
-
)
|
|
138
|
-
cols_with_excs_str += f' ({", ".join(status.cols_with_excs)})'
|
|
139
|
-
msg = (
|
|
140
|
-
f'Inserted {status.num_rows} row{"" if status.num_rows == 1 else "s"} '
|
|
141
|
-
f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}{cols_with_excs_str}.'
|
|
143
|
+
from pixeltable.io.table_data_conduit import UnkTableDataConduit
|
|
144
|
+
|
|
145
|
+
table = self
|
|
146
|
+
if source is None:
|
|
147
|
+
source = [kwargs]
|
|
148
|
+
kwargs = None
|
|
149
|
+
|
|
150
|
+
tds = UnkTableDataConduit(
|
|
151
|
+
source, source_format=source_format, src_schema_overrides=schema_overrides, extra_fields=kwargs
|
|
142
152
|
)
|
|
143
|
-
|
|
144
|
-
|
|
153
|
+
data_source = tds.specialize()
|
|
154
|
+
if data_source.source_column_map is None:
|
|
155
|
+
data_source.src_pk = []
|
|
156
|
+
|
|
157
|
+
assert isinstance(table, Table)
|
|
158
|
+
data_source.add_table_info(table)
|
|
159
|
+
data_source.prepare_for_insert_into_table()
|
|
160
|
+
|
|
161
|
+
fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
|
|
162
|
+
return table.insert_table_data_source(
|
|
163
|
+
data_source=data_source, fail_on_exception=fail_on_exception, print_stats=print_stats
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
def insert_table_data_source(
|
|
167
|
+
self, data_source: TableDataConduit, fail_on_exception: bool, print_stats: bool = False
|
|
168
|
+
) -> pxt.UpdateStatus:
|
|
169
|
+
"""Insert row batches into this table from a `TableDataConduit`."""
|
|
170
|
+
from pixeltable.io.table_data_conduit import DFTableDataConduit, TableDataConduit
|
|
171
|
+
|
|
172
|
+
status = pxt.UpdateStatus()
|
|
173
|
+
with Env.get().begin_xact():
|
|
174
|
+
if isinstance(data_source, DFTableDataConduit):
|
|
175
|
+
status += self._tbl_version.get().insert(
|
|
176
|
+
rows=None, df=data_source.pxt_df, print_stats=print_stats, fail_on_exception=fail_on_exception
|
|
177
|
+
)
|
|
178
|
+
else:
|
|
179
|
+
for row_batch in data_source.valid_row_batch():
|
|
180
|
+
status += self._tbl_version.get().insert(
|
|
181
|
+
rows=row_batch, df=None, print_stats=print_stats, fail_on_exception=fail_on_exception
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
Env.get().console_logger.info(status.insert_msg)
|
|
185
|
+
|
|
145
186
|
FileCache.get().emit_eviction_warnings()
|
|
146
187
|
return status
|
|
147
188
|
|
pixeltable/catalog/path.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
from typing import Iterator
|
|
4
5
|
|
|
5
6
|
from pixeltable import exceptions as excs
|
|
6
7
|
|
|
@@ -55,5 +56,19 @@ class Path:
|
|
|
55
56
|
is_prefix = self.components == other.components[: self.len]
|
|
56
57
|
return is_prefix and (self.len == (other.len - 1) or not is_parent)
|
|
57
58
|
|
|
59
|
+
def ancestors(self) -> Iterator[Path]:
|
|
60
|
+
"""
|
|
61
|
+
Return all ancestors of this path in top-down order including root.
|
|
62
|
+
If this path is for the root directory, which has no parent, then nothing is yielded (the iterator is empty).
|
|
63
|
+
"""
|
|
64
|
+
if self.is_root:
|
|
65
|
+
return
|
|
66
|
+
else:
|
|
67
|
+
for i in range(0, len(self.components)):
|
|
68
|
+
yield Path('.'.join(self.components[0:i]), empty_is_valid=True)
|
|
69
|
+
|
|
58
70
|
def __str__(self) -> str:
|
|
59
71
|
return '.'.join(self.components)
|
|
72
|
+
|
|
73
|
+
def __lt__(self, other: Path) -> bool:
|
|
74
|
+
return str(self) < str(other)
|
|
@@ -2,7 +2,7 @@ from abc import abstractmethod
|
|
|
2
2
|
from typing import TYPE_CHECKING, Any, Optional
|
|
3
3
|
from uuid import UUID
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
from pixeltable.env import Env
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from pixeltable import catalog
|
|
@@ -28,24 +28,19 @@ class SchemaObject:
|
|
|
28
28
|
"""Returns the parent directory of this schema object."""
|
|
29
29
|
from .catalog import Catalog
|
|
30
30
|
|
|
31
|
-
with
|
|
31
|
+
with Env.get().begin_xact():
|
|
32
32
|
if self._dir_id is None:
|
|
33
33
|
return None
|
|
34
34
|
return Catalog.get().get_dir(self._dir_id)
|
|
35
35
|
|
|
36
36
|
def _path(self) -> str:
|
|
37
37
|
"""Returns the path to this schema object."""
|
|
38
|
-
|
|
39
|
-
from .catalog import Catalog
|
|
38
|
+
from .catalog import Catalog
|
|
40
39
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
# root directory. Either way, we return just the name.
|
|
46
|
-
return self._name
|
|
47
|
-
else:
|
|
48
|
-
return f'{dir_path}.{self._name}'
|
|
40
|
+
assert self._dir_id is not None
|
|
41
|
+
with Env.get().begin_xact():
|
|
42
|
+
path = Catalog.get().get_dir_path(self._dir_id)
|
|
43
|
+
return str(path.append(self._name))
|
|
49
44
|
|
|
50
45
|
def get_metadata(self) -> dict[str, Any]:
|
|
51
46
|
"""Returns metadata associated with this schema object."""
|
pixeltable/catalog/table.py
CHANGED
|
@@ -8,6 +8,7 @@ from pathlib import Path
|
|
|
8
8
|
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, Union, overload
|
|
9
9
|
|
|
10
10
|
from typing import _GenericAlias # type: ignore[attr-defined] # isort: skip
|
|
11
|
+
from keyword import iskeyword as is_python_keyword
|
|
11
12
|
from uuid import UUID
|
|
12
13
|
|
|
13
14
|
import pandas as pd
|
|
@@ -42,9 +43,11 @@ from .table_version_handle import TableVersionHandle
|
|
|
42
43
|
from .table_version_path import TableVersionPath
|
|
43
44
|
|
|
44
45
|
if TYPE_CHECKING:
|
|
46
|
+
import datasets # type: ignore[import-untyped]
|
|
45
47
|
import torch.utils.data
|
|
46
48
|
|
|
47
49
|
import pixeltable.plan
|
|
50
|
+
from pixeltable.globals import RowData, TableDataSource
|
|
48
51
|
|
|
49
52
|
_logger = logging.getLogger('pixeltable')
|
|
50
53
|
|
|
@@ -171,8 +174,8 @@ class Table(SchemaObject):
|
|
|
171
174
|
|
|
172
175
|
def _get_views(self, *, recursive: bool = True) -> list['Table']:
|
|
173
176
|
cat = catalog.Catalog.get()
|
|
174
|
-
view_ids = cat.
|
|
175
|
-
views = [cat.
|
|
177
|
+
view_ids = cat.get_view_ids(self._id)
|
|
178
|
+
views = [cat.get_table_by_id(id) for id in view_ids]
|
|
176
179
|
if recursive:
|
|
177
180
|
views.extend([t for view in views for t in view._get_views(recursive=True)])
|
|
178
181
|
return views
|
|
@@ -265,7 +268,7 @@ class Table(SchemaObject):
|
|
|
265
268
|
if self._tbl_version_path.base is None:
|
|
266
269
|
return None
|
|
267
270
|
base_id = self._tbl_version_path.base.tbl_version.id
|
|
268
|
-
return catalog.Catalog.get().
|
|
271
|
+
return catalog.Catalog.get().get_table_by_id(base_id)
|
|
269
272
|
|
|
270
273
|
@property
|
|
271
274
|
def _bases(self) -> list['Table']:
|
|
@@ -369,11 +372,6 @@ class Table(SchemaObject):
|
|
|
369
372
|
pd_rows.append(row)
|
|
370
373
|
return pd.DataFrame(pd_rows)
|
|
371
374
|
|
|
372
|
-
def ensure_md_loaded(self) -> None:
|
|
373
|
-
"""Ensure that table metadata is loaded."""
|
|
374
|
-
for col in self._tbl_version.get().cols_by_id.values():
|
|
375
|
-
_ = col.value_expr
|
|
376
|
-
|
|
377
375
|
def describe(self) -> None:
|
|
378
376
|
"""
|
|
379
377
|
Print the table schema.
|
|
@@ -387,13 +385,9 @@ class Table(SchemaObject):
|
|
|
387
385
|
print(repr(self))
|
|
388
386
|
|
|
389
387
|
def _drop(self) -> None:
|
|
390
|
-
cat = catalog.Catalog.get()
|
|
391
388
|
self._check_is_dropped()
|
|
392
389
|
self._tbl_version.get().drop()
|
|
393
390
|
self._is_dropped = True
|
|
394
|
-
# update catalog
|
|
395
|
-
cat = catalog.Catalog.get()
|
|
396
|
-
cat.remove_tbl(self._id)
|
|
397
391
|
|
|
398
392
|
# TODO Factor this out into a separate module.
|
|
399
393
|
# The return type is unresolvable, but torch can't be imported since it's an optional dependency.
|
|
@@ -729,13 +723,18 @@ class Table(SchemaObject):
|
|
|
729
723
|
columns.append(column)
|
|
730
724
|
return columns
|
|
731
725
|
|
|
726
|
+
@classmethod
|
|
727
|
+
def validate_column_name(cls, name: str) -> None:
|
|
728
|
+
"""Check that a name is usable as a Pixeltable column name"""
|
|
729
|
+
if is_system_column_name(name) or is_python_keyword(name):
|
|
730
|
+
raise excs.Error(f'{name!r} is a reserved name in Pixeltable; please choose a different column name.')
|
|
731
|
+
if not is_valid_identifier(name):
|
|
732
|
+
raise excs.Error(f'Invalid column name: {name!r}')
|
|
733
|
+
|
|
732
734
|
@classmethod
|
|
733
735
|
def _verify_column(cls, col: Column) -> None:
|
|
734
736
|
"""Check integrity of user-supplied Column and supply defaults"""
|
|
735
|
-
|
|
736
|
-
raise excs.Error(f'{col.name!r} is a reserved name in Pixeltable; please choose a different column name.')
|
|
737
|
-
if not is_valid_identifier(col.name):
|
|
738
|
-
raise excs.Error(f'Invalid column name: {col.name!r}')
|
|
737
|
+
cls.validate_column_name(col.name)
|
|
739
738
|
if col.stored is False and not col.is_computed:
|
|
740
739
|
raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed columns')
|
|
741
740
|
if col.stored is False and col.has_window_fn_call():
|
|
@@ -754,16 +753,6 @@ class Table(SchemaObject):
|
|
|
754
753
|
cls._verify_column(col)
|
|
755
754
|
column_names.add(col.name)
|
|
756
755
|
|
|
757
|
-
def __check_column_name_exists(self, column_name: str, include_bases: bool = False) -> None:
|
|
758
|
-
col = self._tbl_version_path.get_column(column_name, include_bases)
|
|
759
|
-
if col is None:
|
|
760
|
-
raise excs.Error(f'Column {column_name!r} unknown')
|
|
761
|
-
|
|
762
|
-
def __check_column_ref_exists(self, col_ref: ColumnRef, include_bases: bool = False) -> None:
|
|
763
|
-
exists = self._tbl_version_path.has_column(col_ref.col, include_bases)
|
|
764
|
-
if not exists:
|
|
765
|
-
raise excs.Error(f'Unknown column: {col_ref.col.qualified_name}')
|
|
766
|
-
|
|
767
756
|
def drop_column(self, column: Union[str, ColumnRef], if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
|
|
768
757
|
"""Drop a column from the table.
|
|
769
758
|
|
|
@@ -916,7 +905,7 @@ class Table(SchemaObject):
|
|
|
916
905
|
Args:
|
|
917
906
|
column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
|
|
918
907
|
idx_name: An optional name for the index. If not specified, a name such as `'idx0'` will be generated
|
|
919
|
-
automatically. If specified, the name must be unique for this table.
|
|
908
|
+
automatically. If specified, the name must be unique for this table and a valid pixeltable column name.
|
|
920
909
|
embedding: The UDF to use for the embedding. Must be a UDF that accepts a single argument of type `String`
|
|
921
910
|
or `Image` (as appropriate for the column being indexed) and returns a fixed-size 1-dimensional
|
|
922
911
|
array of floats.
|
|
@@ -969,13 +958,7 @@ class Table(SchemaObject):
|
|
|
969
958
|
"""
|
|
970
959
|
if self._tbl_version_path.is_snapshot():
|
|
971
960
|
raise excs.Error('Cannot add an index to a snapshot')
|
|
972
|
-
col
|
|
973
|
-
if isinstance(column, str):
|
|
974
|
-
self.__check_column_name_exists(column, include_bases=True)
|
|
975
|
-
col = self._tbl_version_path.get_column(column, include_bases=True)
|
|
976
|
-
else:
|
|
977
|
-
self.__check_column_ref_exists(column, include_bases=True)
|
|
978
|
-
col = column.col
|
|
961
|
+
col = self._resolve_column_parameter(column)
|
|
979
962
|
|
|
980
963
|
with Env.get().begin_xact():
|
|
981
964
|
if idx_name is not None and idx_name in self._tbl_version.get().idxs_by_name:
|
|
@@ -995,6 +978,10 @@ class Table(SchemaObject):
|
|
|
995
978
|
assert idx_name not in self._tbl_version.get().idxs_by_name
|
|
996
979
|
from pixeltable.index import EmbeddingIndex
|
|
997
980
|
|
|
981
|
+
# idx_name must be a valid pixeltable column name
|
|
982
|
+
if idx_name is not None:
|
|
983
|
+
Table.validate_column_name(idx_name)
|
|
984
|
+
|
|
998
985
|
# create the EmbeddingIndex instance to verify args
|
|
999
986
|
idx = EmbeddingIndex(
|
|
1000
987
|
col, metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed
|
|
@@ -1058,17 +1045,28 @@ class Table(SchemaObject):
|
|
|
1058
1045
|
|
|
1059
1046
|
col: Column = None
|
|
1060
1047
|
if idx_name is None:
|
|
1061
|
-
|
|
1062
|
-
self.__check_column_name_exists(column, include_bases=True)
|
|
1063
|
-
col = self._tbl_version_path.get_column(column, include_bases=True)
|
|
1064
|
-
else:
|
|
1065
|
-
self.__check_column_ref_exists(column, include_bases=True)
|
|
1066
|
-
col = column.col
|
|
1048
|
+
col = self._resolve_column_parameter(column)
|
|
1067
1049
|
assert col is not None
|
|
1068
1050
|
|
|
1069
1051
|
with Env.get().begin_xact():
|
|
1070
1052
|
self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex, if_not_exists=if_not_exists)
|
|
1071
1053
|
|
|
1054
|
+
def _resolve_column_parameter(self, column: Union[str, ColumnRef]) -> Column:
|
|
1055
|
+
"""Resolve a column parameter to a Column object"""
|
|
1056
|
+
col: Column = None
|
|
1057
|
+
if isinstance(column, str):
|
|
1058
|
+
col = self._tbl_version_path.get_column(column, include_bases=True)
|
|
1059
|
+
if col is None:
|
|
1060
|
+
raise excs.Error(f'Column {column!r} unknown')
|
|
1061
|
+
elif isinstance(column, ColumnRef):
|
|
1062
|
+
exists = self._tbl_version_path.has_column(column.col, include_bases=True)
|
|
1063
|
+
if not exists:
|
|
1064
|
+
raise excs.Error(f'Unknown column: {column.col.qualified_name}')
|
|
1065
|
+
col = column.col
|
|
1066
|
+
else:
|
|
1067
|
+
raise excs.Error(f'Invalid column parameter type: {type(column)}')
|
|
1068
|
+
return col
|
|
1069
|
+
|
|
1072
1070
|
def drop_index(
|
|
1073
1071
|
self,
|
|
1074
1072
|
*,
|
|
@@ -1124,12 +1122,7 @@ class Table(SchemaObject):
|
|
|
1124
1122
|
|
|
1125
1123
|
col: Column = None
|
|
1126
1124
|
if idx_name is None:
|
|
1127
|
-
|
|
1128
|
-
self.__check_column_name_exists(column, include_bases=True)
|
|
1129
|
-
col = self._tbl_version_path.get_column(column, include_bases=True)
|
|
1130
|
-
else:
|
|
1131
|
-
self.__check_column_ref_exists(column, include_bases=True)
|
|
1132
|
-
col = column.col
|
|
1125
|
+
col = self._resolve_column_parameter(column)
|
|
1133
1126
|
assert col is not None
|
|
1134
1127
|
|
|
1135
1128
|
with Env.get().begin_xact():
|
|
@@ -1154,49 +1147,62 @@ class Table(SchemaObject):
|
|
|
1154
1147
|
raise excs.Error(f'Index {idx_name!r} does not exist')
|
|
1155
1148
|
assert _if_not_exists == IfNotExistsParam.IGNORE
|
|
1156
1149
|
return
|
|
1157
|
-
|
|
1150
|
+
idx_info = self._tbl_version.get().idxs_by_name[idx_name]
|
|
1158
1151
|
else:
|
|
1159
1152
|
if col.tbl.id != self._tbl_version.id:
|
|
1160
1153
|
raise excs.Error(
|
|
1161
1154
|
f'Column {col.name!r}: cannot drop index from column that belongs to base ({col.tbl.get().name}!r)'
|
|
1162
1155
|
)
|
|
1163
|
-
|
|
1156
|
+
idx_info_list = [info for info in self._tbl_version.get().idxs_by_name.values() if info.col.id == col.id]
|
|
1164
1157
|
if _idx_class is not None:
|
|
1165
|
-
|
|
1166
|
-
if len(
|
|
1158
|
+
idx_info_list = [info for info in idx_info_list if isinstance(info.idx, _idx_class)]
|
|
1159
|
+
if len(idx_info_list) == 0:
|
|
1167
1160
|
_if_not_exists = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
1168
1161
|
if _if_not_exists == IfNotExistsParam.ERROR:
|
|
1169
1162
|
raise excs.Error(f'Column {col.name!r} does not have an index')
|
|
1170
1163
|
assert _if_not_exists == IfNotExistsParam.IGNORE
|
|
1171
1164
|
return
|
|
1172
|
-
if len(
|
|
1165
|
+
if len(idx_info_list) > 1:
|
|
1173
1166
|
raise excs.Error(f"Column {col.name!r} has multiple indices; specify 'idx_name' instead")
|
|
1174
|
-
|
|
1175
|
-
|
|
1167
|
+
idx_info = idx_info_list[0]
|
|
1168
|
+
|
|
1169
|
+
# Find out if anything depends on this index
|
|
1170
|
+
dependent_user_cols = [c for c in idx_info.val_col.dependent_cols if c.name is not None]
|
|
1171
|
+
if len(dependent_user_cols) > 0:
|
|
1172
|
+
raise excs.Error(
|
|
1173
|
+
f'Cannot drop index because the following columns depend on it:\n'
|
|
1174
|
+
f'{", ".join(c.name for c in dependent_user_cols)}'
|
|
1175
|
+
)
|
|
1176
|
+
self._tbl_version.get().drop_index(idx_info.id)
|
|
1176
1177
|
|
|
1177
1178
|
@overload
|
|
1178
1179
|
def insert(
|
|
1179
1180
|
self,
|
|
1180
|
-
|
|
1181
|
+
source: TableDataSource,
|
|
1181
1182
|
/,
|
|
1182
1183
|
*,
|
|
1183
|
-
|
|
1184
|
+
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
1185
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
1184
1186
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1187
|
+
print_stats: bool = False,
|
|
1188
|
+
**kwargs: Any,
|
|
1185
1189
|
) -> UpdateStatus: ...
|
|
1186
1190
|
|
|
1187
1191
|
@overload
|
|
1188
1192
|
def insert(
|
|
1189
|
-
self, *,
|
|
1193
|
+
self, /, *, on_error: Literal['abort', 'ignore'] = 'abort', print_stats: bool = False, **kwargs: Any
|
|
1190
1194
|
) -> UpdateStatus: ...
|
|
1191
1195
|
|
|
1192
|
-
@abc.abstractmethod
|
|
1196
|
+
@abc.abstractmethod
|
|
1193
1197
|
def insert(
|
|
1194
1198
|
self,
|
|
1195
|
-
|
|
1199
|
+
source: Optional[TableDataSource] = None,
|
|
1196
1200
|
/,
|
|
1197
1201
|
*,
|
|
1198
|
-
|
|
1202
|
+
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
1203
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
1199
1204
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1205
|
+
print_stats: bool = False,
|
|
1200
1206
|
**kwargs: Any,
|
|
1201
1207
|
) -> UpdateStatus:
|
|
1202
1208
|
"""Inserts rows into this table. There are two mutually exclusive call patterns:
|
|
@@ -1205,11 +1211,12 @@ class Table(SchemaObject):
|
|
|
1205
1211
|
|
|
1206
1212
|
```python
|
|
1207
1213
|
insert(
|
|
1208
|
-
|
|
1214
|
+
source: TableDataSource,
|
|
1209
1215
|
/,
|
|
1210
1216
|
*,
|
|
1217
|
+
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1211
1218
|
print_stats: bool = False,
|
|
1212
|
-
|
|
1219
|
+
**kwargs: Any,
|
|
1213
1220
|
)```
|
|
1214
1221
|
|
|
1215
1222
|
To insert just a single row, you can use the more concise syntax:
|
|
@@ -1217,23 +1224,25 @@ class Table(SchemaObject):
|
|
|
1217
1224
|
```python
|
|
1218
1225
|
insert(
|
|
1219
1226
|
*,
|
|
1220
|
-
print_stats: bool = False,
|
|
1221
1227
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
1228
|
+
print_stats: bool = False,
|
|
1222
1229
|
**kwargs: Any
|
|
1223
1230
|
)```
|
|
1224
1231
|
|
|
1225
1232
|
Args:
|
|
1226
|
-
|
|
1227
|
-
names to values.
|
|
1233
|
+
source: A data source from which data can be imported.
|
|
1228
1234
|
kwargs: (if inserting a single row) Keyword-argument pairs representing column names and values.
|
|
1229
|
-
|
|
1235
|
+
(if inserting multiple rows) Additional keyword arguments are passed to the data source.
|
|
1236
|
+
source_format: A hint about the format of the source data
|
|
1237
|
+
schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
|
|
1230
1238
|
on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
|
|
1231
1239
|
invalid media file (such as a corrupt image) for one of the inserted rows.
|
|
1232
1240
|
|
|
1233
1241
|
- If `on_error='abort'`, then an exception will be raised and the rows will not be inserted.
|
|
1234
1242
|
- If `on_error='ignore'`, then execution will continue and the rows will be inserted. Any cells
|
|
1235
|
-
|
|
1236
|
-
|
|
1243
|
+
with errors will have a `None` value for that cell, with information about the error stored in the
|
|
1244
|
+
corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
|
|
1245
|
+
print_stats: If `True`, print statistics about the cost of computed columns.
|
|
1237
1246
|
|
|
1238
1247
|
Returns:
|
|
1239
1248
|
An [`UpdateStatus`][pixeltable.UpdateStatus] object containing information about the update.
|
|
@@ -1245,6 +1254,7 @@ class Table(SchemaObject):
|
|
|
1245
1254
|
- The table has been dropped.
|
|
1246
1255
|
- One of the rows being inserted does not conform to the table schema.
|
|
1247
1256
|
- An error occurs during processing of computed columns, and `on_error='abort'`.
|
|
1257
|
+
- An error occurs while importing data from a source, and `on_error='abort'`.
|
|
1248
1258
|
|
|
1249
1259
|
Examples:
|
|
1250
1260
|
Insert two rows into the table `my_table` with three int columns ``a``, ``b``, and ``c``.
|
|
@@ -1256,6 +1266,10 @@ class Table(SchemaObject):
|
|
|
1256
1266
|
Insert a single row using the alternative syntax:
|
|
1257
1267
|
|
|
1258
1268
|
>>> tbl.insert(a=3, b=3, c=3)
|
|
1269
|
+
|
|
1270
|
+
Insert rows from a CSV file:
|
|
1271
|
+
|
|
1272
|
+
>>> tbl.insert(source='path/to/file.csv')
|
|
1259
1273
|
"""
|
|
1260
1274
|
raise NotImplementedError
|
|
1261
1275
|
|