pixeltable 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -87
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1085 -262
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -126
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.0.dist-info/LICENSE +18 -0
- pixeltable-0.2.0.dist-info/METADATA +117 -0
- pixeltable-0.2.0.dist-info/RECORD +125 -0
- {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.1.dist-info/METADATA +0 -31
- pixeltable-0.1.1.dist-info/RECORD +0 -36
pixeltable/catalog.py (DELETED)
|
@@ -1,1421 +0,0 @@
|
|
|
1
|
-
from typing import Optional, List, Set, Dict, Any, Type, Union, Callable
|
|
2
|
-
import re
|
|
3
|
-
import inspect
|
|
4
|
-
import io
|
|
5
|
-
import os
|
|
6
|
-
import dataclasses
|
|
7
|
-
|
|
8
|
-
import PIL, cv2
|
|
9
|
-
import numpy as np
|
|
10
|
-
from PIL import Image
|
|
11
|
-
from tqdm.autonotebook import tqdm
|
|
12
|
-
import pathlib
|
|
13
|
-
|
|
14
|
-
import pandas as pd
|
|
15
|
-
import sqlalchemy as sql
|
|
16
|
-
import sqlalchemy.orm as orm
|
|
17
|
-
|
|
18
|
-
from pixeltable import store
|
|
19
|
-
from pixeltable.env import Env
|
|
20
|
-
from pixeltable import exceptions as exc
|
|
21
|
-
from pixeltable.type_system import ColumnType
|
|
22
|
-
from pixeltable.utils import clip, video
|
|
23
|
-
from pixeltable import utils
|
|
24
|
-
from pixeltable.index import VectorIndex
|
|
25
|
-
from pixeltable.function import Function, FunctionRegistry
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
_ID_RE = r'[a-zA-Z]\w*'
|
|
29
|
-
_PATH_RE = f'{_ID_RE}(\\.{_ID_RE})*'
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class Column:
|
|
33
|
-
def __init__(
|
|
34
|
-
self, name: str, col_type: Optional[ColumnType] = None,
|
|
35
|
-
computed_with: Optional[Union['Expr', Callable]] = None,
|
|
36
|
-
primary_key: bool = False, nullable: bool = True, col_id: Optional[int] = None,
|
|
37
|
-
value_expr_str: Optional[str] = None, indexed: bool = False):
|
|
38
|
-
"""
|
|
39
|
-
Computed columns: those have a non-None computed_with argument
|
|
40
|
-
- when constructed by the user: 'computed_with' was constructed explicitly and is passed in;
|
|
41
|
-
'value_expr_str' is None and col_type is None
|
|
42
|
-
- when loaded from store: 'value_expr_str' is the serialized form and col_type is set;
|
|
43
|
-
'computed_with' is None
|
|
44
|
-
Computed_with is a Callable:
|
|
45
|
-
- the callable's parameter names must correspond to existing columns in the table for which this Column
|
|
46
|
-
is being used
|
|
47
|
-
- col_type needs to be set to the callable's return type
|
|
48
|
-
|
|
49
|
-
indexed: only valid for image columns; if true, maintains an NN index for this column
|
|
50
|
-
"""
|
|
51
|
-
from pixeltable import exprs
|
|
52
|
-
if re.fullmatch(_ID_RE, name) is None:
|
|
53
|
-
raise exc.BadFormatError(f"Invalid column name: '{name}'")
|
|
54
|
-
self.name = name
|
|
55
|
-
if col_type is None and computed_with is None:
|
|
56
|
-
raise exc.Error(f'Column {name}: col_type is required if computed_with is not specified')
|
|
57
|
-
assert not(value_expr_str is not None and computed_with is not None)
|
|
58
|
-
|
|
59
|
-
self.value_expr: Optional['Expr'] = None
|
|
60
|
-
self.compute_func: Optional[Callable] = None
|
|
61
|
-
if computed_with is not None:
|
|
62
|
-
value_expr = exprs.Expr.from_object(computed_with)
|
|
63
|
-
if value_expr is None:
|
|
64
|
-
# computed_with needs to be a Callable
|
|
65
|
-
if not isinstance(computed_with, Callable):
|
|
66
|
-
raise exc.Error(
|
|
67
|
-
f'Column {name}: computed_with needs to be either a Pixeltable expression or a Callable, '
|
|
68
|
-
f'but it is a {type(computed_with)}')
|
|
69
|
-
if col_type is None:
|
|
70
|
-
raise exc.Error(f'Column {name}: col_type is required if computed_with is a Callable')
|
|
71
|
-
# we need to turn the computed_with function into an Expr, but this requires resolving
|
|
72
|
-
# column name references and for that we need to wait until we're assigned to a Table
|
|
73
|
-
self.compute_func = computed_with
|
|
74
|
-
else:
|
|
75
|
-
self.value_expr = value_expr.copy()
|
|
76
|
-
self.col_type = self.value_expr.col_type
|
|
77
|
-
|
|
78
|
-
if col_type is not None:
|
|
79
|
-
self.col_type = col_type
|
|
80
|
-
assert self.col_type is not None
|
|
81
|
-
|
|
82
|
-
self.value_expr_str = value_expr_str # stored here so it's easily accessible for the Table c'tor
|
|
83
|
-
self.dependent_cols: List[Column] = [] # cols with value_exprs that reference us
|
|
84
|
-
self.id = col_id
|
|
85
|
-
self.primary_key = primary_key
|
|
86
|
-
# computed cols are always nullable
|
|
87
|
-
self.nullable = nullable or computed_with is not None or value_expr_str is not None
|
|
88
|
-
self.sa_col: Optional[sql.schema.Column] = None
|
|
89
|
-
|
|
90
|
-
if indexed and not self.col_type.is_image_type():
|
|
91
|
-
raise exc.Error(f'Column {name}: indexed=True requires ImageType')
|
|
92
|
-
self.is_indexed = indexed
|
|
93
|
-
self.idx: Optional[VectorIndex] = None
|
|
94
|
-
|
|
95
|
-
def to_sql(self) -> str:
|
|
96
|
-
return f'{self.storage_name()} {self.col_type.to_sql()}'
|
|
97
|
-
|
|
98
|
-
@property
|
|
99
|
-
def is_computed(self) -> bool:
|
|
100
|
-
return self.compute_func is not None or self.value_expr is not None
|
|
101
|
-
|
|
102
|
-
def create_sa_col(self) -> None:
|
|
103
|
-
"""
|
|
104
|
-
This needs to be recreated for every new table schema version.
|
|
105
|
-
"""
|
|
106
|
-
self.sa_col = sql.Column(self.storage_name(), self.col_type.to_sa_type(), nullable=self.nullable)
|
|
107
|
-
|
|
108
|
-
def set_idx(self, idx: VectorIndex) -> None:
|
|
109
|
-
self.idx = idx
|
|
110
|
-
|
|
111
|
-
def storage_name(self) -> str:
|
|
112
|
-
assert self.id is not None
|
|
113
|
-
return f'col_{self.id}'
|
|
114
|
-
|
|
115
|
-
def __str__(self) -> str:
|
|
116
|
-
return f'{self.name}: {self.col_type}'
|
|
117
|
-
|
|
118
|
-
def __eq__(self, other: object) -> bool:
|
|
119
|
-
if not isinstance(other, Column):
|
|
120
|
-
return False
|
|
121
|
-
if self.sa_col is None or other.sa_col is None:
|
|
122
|
-
return False
|
|
123
|
-
# if they point to the same table column, they're the same
|
|
124
|
-
return str(self.sa_col) == str(other.sa_col)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
# base class of all addressable objects within a Db
|
|
128
|
-
class SchemaObject:
|
|
129
|
-
def __init__(self, obj_id: int):
|
|
130
|
-
self.id = obj_id
|
|
131
|
-
|
|
132
|
-
@classmethod
|
|
133
|
-
def display_name(cls) -> str:
|
|
134
|
-
"""
|
|
135
|
-
Return name displayed in error messages.
|
|
136
|
-
"""
|
|
137
|
-
assert False
|
|
138
|
-
return ''
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
class DirBase(SchemaObject):
|
|
142
|
-
def __init__(self, dir_id: int):
|
|
143
|
-
super().__init__(dir_id)
|
|
144
|
-
|
|
145
|
-
@classmethod
|
|
146
|
-
def display_name(cls) -> str:
|
|
147
|
-
return 'directory'
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
# contains only MutableTables
|
|
151
|
-
class Dir(DirBase):
|
|
152
|
-
def __init__(self, dir_id: int):
|
|
153
|
-
super().__init__(dir_id)
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# contains only TableSnapshots
|
|
157
|
-
class SnapshotDir(DirBase):
|
|
158
|
-
def __init__(self, dir_id: int):
|
|
159
|
-
super().__init__(dir_id)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
class NamedFunction(SchemaObject):
|
|
163
|
-
"""
|
|
164
|
-
Contains references to functions that are named and have a path within a db.
|
|
165
|
-
The Function itself is stored in the FunctionRegistry.
|
|
166
|
-
"""
|
|
167
|
-
def __init__(self, id: int, dir_id: int, name: str):
|
|
168
|
-
super().__init__(id)
|
|
169
|
-
self.dir_id = dir_id
|
|
170
|
-
self.name = name
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
class Table(SchemaObject):
|
|
174
|
-
#def __init__(self, tbl_record: store.Table, schema: List[Column]):
|
|
175
|
-
def __init__(
|
|
176
|
-
self, db_id: int, tbl_id: int, dir_id: int, name: str, version: int, cols: List[Column]):
|
|
177
|
-
super().__init__(tbl_id)
|
|
178
|
-
self.db_id = db_id
|
|
179
|
-
self.dir_id = dir_id
|
|
180
|
-
# TODO: this will be out-of-date after a rename()
|
|
181
|
-
self.name = name
|
|
182
|
-
for pos, col in enumerate(cols):
|
|
183
|
-
if re.fullmatch(_ID_RE, col.name) is None:
|
|
184
|
-
raise exc.BadFormatError(f"Invalid column name: '{col.name}'")
|
|
185
|
-
assert col.id is not None
|
|
186
|
-
self.cols = cols
|
|
187
|
-
self.cols_by_name = {col.name: col for col in cols}
|
|
188
|
-
self.cols_by_id = {col.id: col for col in cols}
|
|
189
|
-
self.version = version
|
|
190
|
-
|
|
191
|
-
# we can't call _load_valid_rowids() here because the storage table may not exist yet
|
|
192
|
-
self.valid_rowids: Set[int] = set()
|
|
193
|
-
|
|
194
|
-
# sqlalchemy-related metadata; used to insert and query the storage table
|
|
195
|
-
self.sa_md = sql.MetaData()
|
|
196
|
-
self._create_sa_tbl()
|
|
197
|
-
self.is_dropped = False
|
|
198
|
-
|
|
199
|
-
# make sure to traverse columns ordered by position = order in which cols were created;
|
|
200
|
-
# this guarantees that references always point backwards
|
|
201
|
-
for col in self.cols:
|
|
202
|
-
if col.value_expr is not None or col.value_expr_str is not None:
|
|
203
|
-
self._record_value_expr(col)
|
|
204
|
-
|
|
205
|
-
def _record_value_expr(self, col: Column) -> None:
|
|
206
|
-
"""
|
|
207
|
-
Update Column.dependent_cols for all cols referenced in col.value_expr.
|
|
208
|
-
Creates col.value_expr if it doesn't exist yet.
|
|
209
|
-
"""
|
|
210
|
-
from pixeltable.exprs import Expr, ColumnRef
|
|
211
|
-
if col.value_expr is None:
|
|
212
|
-
assert col.value_expr_str is not None
|
|
213
|
-
col.value_expr = Expr.deserialize(col.value_expr_str, self)
|
|
214
|
-
|
|
215
|
-
refd_col_ids = [e.col.id for e in col.value_expr.subexprs() if isinstance(e, ColumnRef)]
|
|
216
|
-
refd_cols = [self.cols_by_id[id] for id in refd_col_ids]
|
|
217
|
-
for refd_col in refd_cols:
|
|
218
|
-
refd_col.dependent_cols.append(col)
|
|
219
|
-
|
|
220
|
-
def _load_valid_rowids(self) -> None:
|
|
221
|
-
if not any(col.col_type.is_image_type() for col in self.cols):
|
|
222
|
-
return
|
|
223
|
-
stmt = sql.select(self.rowid_col) \
|
|
224
|
-
.where(self.v_min_col <= self.version) \
|
|
225
|
-
.where(self.v_max_col > self.version)
|
|
226
|
-
with Env.get().engine.begin() as conn:
|
|
227
|
-
rows = conn.execute(stmt)
|
|
228
|
-
for row in rows:
|
|
229
|
-
rowid = row[0]
|
|
230
|
-
self.valid_rowids.add(rowid)
|
|
231
|
-
|
|
232
|
-
def __getattr__(self, col_name: str) -> 'pixeltable.exprs.ColumnRef':
|
|
233
|
-
if col_name not in self.cols_by_name:
|
|
234
|
-
raise AttributeError(f'Column {col_name} unknown')
|
|
235
|
-
col = self.cols_by_name[col_name]
|
|
236
|
-
from pixeltable.exprs import ColumnRef
|
|
237
|
-
return ColumnRef(col)
|
|
238
|
-
|
|
239
|
-
def __getitem__(self, index: object) -> Union['pixeltable.exprs.ColumnRef', 'pixeltable.dataframe.DataFrame']:
|
|
240
|
-
if isinstance(index, str):
|
|
241
|
-
# basically <tbl>.<colname>
|
|
242
|
-
return self.__getattr__(index)
|
|
243
|
-
from pixeltable.dataframe import DataFrame
|
|
244
|
-
return DataFrame(self).__getitem__(index)
|
|
245
|
-
|
|
246
|
-
def df(self) -> 'pixeltable.dataframe.DataFrame':
|
|
247
|
-
# local import: avoid circular imports
|
|
248
|
-
from pixeltable.dataframe import DataFrame
|
|
249
|
-
return DataFrame(self)
|
|
250
|
-
|
|
251
|
-
def show(self, *args, **kwargs) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
|
|
252
|
-
return self.df().show(*args, **kwargs)
|
|
253
|
-
|
|
254
|
-
def count(self) -> int:
|
|
255
|
-
return self.df().count()
|
|
256
|
-
|
|
257
|
-
@property
|
|
258
|
-
def columns(self) -> List[Column]:
|
|
259
|
-
return self.cols
|
|
260
|
-
|
|
261
|
-
def storage_name(self) -> str:
|
|
262
|
-
return f'tbl_{self.id}'
|
|
263
|
-
|
|
264
|
-
def _check_is_dropped(self) -> None:
|
|
265
|
-
if self.is_dropped:
|
|
266
|
-
raise exc.OperationalError('Table has been dropped')
|
|
267
|
-
|
|
268
|
-
def _create_sa_tbl(self) -> None:
|
|
269
|
-
self.rowid_col = sql.Column('rowid', sql.BigInteger, nullable=False)
|
|
270
|
-
self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
|
|
271
|
-
self.v_max_col = \
|
|
272
|
-
sql.Column('v_max', sql.BigInteger, nullable=False, server_default=str(store.Table.MAX_VERSION))
|
|
273
|
-
sa_cols = [self.rowid_col, self.v_min_col, self.v_max_col]
|
|
274
|
-
# re-create sql.Columns for each column, regardless of whether it already has sa_col set: it was bound
|
|
275
|
-
# to the last sql.Table version we created and cannot be reused
|
|
276
|
-
for col in self.cols:
|
|
277
|
-
col.create_sa_col()
|
|
278
|
-
sa_cols.extend([col.sa_col for col in self.cols])
|
|
279
|
-
if hasattr(self, 'sa_tbl'):
|
|
280
|
-
self.sa_md.remove(self.sa_tbl)
|
|
281
|
-
self.sa_tbl = sql.Table(self.storage_name(), self.sa_md, *sa_cols)
|
|
282
|
-
|
|
283
|
-
@classmethod
|
|
284
|
-
def _vector_idx_name(cls, tbl_id: int, col: Column) -> str:
|
|
285
|
-
return f'{tbl_id}_{col.id}'
|
|
286
|
-
|
|
287
|
-
# MODULE-LOCAL, NOT PUBLIC
|
|
288
|
-
@classmethod
|
|
289
|
-
def load_cols(cls, tbl_id: int, schema_version: int, session: orm.Session) -> List[Column]:
|
|
290
|
-
"""
|
|
291
|
-
Returns loaded cols.
|
|
292
|
-
"""
|
|
293
|
-
col_records = session.query(store.SchemaColumn) \
|
|
294
|
-
.where(store.SchemaColumn.tbl_id == tbl_id) \
|
|
295
|
-
.where(store.SchemaColumn.schema_version == schema_version) \
|
|
296
|
-
.order_by(store.SchemaColumn.pos.asc()).all()
|
|
297
|
-
cols = [
|
|
298
|
-
Column(
|
|
299
|
-
r.name, ColumnType.deserialize(r.col_type), primary_key=r.is_pk, nullable=r.is_nullable,
|
|
300
|
-
col_id=r.col_id, value_expr_str=r.value_expr, indexed=r.is_indexed)
|
|
301
|
-
for r in col_records
|
|
302
|
-
]
|
|
303
|
-
for col in [col for col in cols if col.col_type.is_image_type()]:
|
|
304
|
-
if col.is_indexed:
|
|
305
|
-
col.set_idx(VectorIndex.load(cls._vector_idx_name(tbl_id, col), dim=512))
|
|
306
|
-
return cols
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
class TableSnapshot(Table):
|
|
310
|
-
def __init__(self, snapshot_record: store.TableSnapshot, cols: List[Column]):
|
|
311
|
-
assert snapshot_record.db_id is not None
|
|
312
|
-
assert snapshot_record.id is not None
|
|
313
|
-
assert snapshot_record.dir_id is not None
|
|
314
|
-
assert snapshot_record.name is not None
|
|
315
|
-
assert snapshot_record.tbl_version is not None
|
|
316
|
-
# the id of this SchemaObject is TableSnapshot.tbl_id, not TableSnapshot.id: we use tbl_id to construct
|
|
317
|
-
# the name of the data table
|
|
318
|
-
super().__init__(
|
|
319
|
-
snapshot_record.db_id, snapshot_record.tbl_id, snapshot_record.dir_id, snapshot_record.name,
|
|
320
|
-
snapshot_record.tbl_version, cols)
|
|
321
|
-
self.snapshot_tbl_id = snapshot_record.id
|
|
322
|
-
# it's safe to call _load_valid_rowids() here because the storage table already exists
|
|
323
|
-
self._load_valid_rowids()
|
|
324
|
-
|
|
325
|
-
def __repr__(self) -> str:
|
|
326
|
-
return f'TableSnapshot(name={self.name})'
|
|
327
|
-
|
|
328
|
-
@classmethod
|
|
329
|
-
def display_name(cls) -> str:
|
|
330
|
-
return 'table snapshot'
|
|
331
|
-
|
|
332
|
-
@dataclasses.dataclass
|
|
333
|
-
class TableParameters:
|
|
334
|
-
# garbage-collect old versions beyond this point, unless they are referenced in a snapshot
|
|
335
|
-
num_retained_versions: int
|
|
336
|
-
|
|
337
|
-
# parameters for frame extraction
|
|
338
|
-
frame_src_col: int # column id
|
|
339
|
-
frame_col: int # column id
|
|
340
|
-
frame_idx_col: int # column id
|
|
341
|
-
extraction_fps: int
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
class MutableTable(Table):
|
|
345
|
-
def __init__(self, tbl_record: store.Table, schema_version: int, cols: List[Column]):
|
|
346
|
-
assert tbl_record.db_id is not None
|
|
347
|
-
assert tbl_record.id is not None
|
|
348
|
-
assert tbl_record.dir_id is not None
|
|
349
|
-
assert tbl_record.name is not None
|
|
350
|
-
assert tbl_record.current_version is not None
|
|
351
|
-
super().__init__(
|
|
352
|
-
tbl_record.db_id, tbl_record.id, tbl_record.dir_id, tbl_record.name, tbl_record.current_version, cols)
|
|
353
|
-
assert tbl_record.next_col_id is not None
|
|
354
|
-
self.next_col_id = tbl_record.next_col_id
|
|
355
|
-
assert tbl_record.next_row_id is not None
|
|
356
|
-
self.next_row_id = tbl_record.next_row_id
|
|
357
|
-
self.schema_version = schema_version
|
|
358
|
-
self.parameters = TableParameters(**tbl_record.parameters)
|
|
359
|
-
|
|
360
|
-
def __repr__(self) -> str:
|
|
361
|
-
return f'MutableTable(name={self.name})'
|
|
362
|
-
|
|
363
|
-
@classmethod
|
|
364
|
-
def display_name(cls) -> str:
|
|
365
|
-
return 'table'
|
|
366
|
-
|
|
367
|
-
def add_column(self, c: Column) -> None:
|
|
368
|
-
self._check_is_dropped()
|
|
369
|
-
if re.fullmatch(_ID_RE, c.name) is None:
|
|
370
|
-
raise exc.BadFormatError(f"Invalid column name: '{c.name}'")
|
|
371
|
-
if c.name in self.cols_by_name:
|
|
372
|
-
raise exc.DuplicateNameError(f'Column {c.name} already exists')
|
|
373
|
-
assert self.next_col_id is not None
|
|
374
|
-
c.id = self.next_col_id
|
|
375
|
-
self.next_col_id += 1
|
|
376
|
-
|
|
377
|
-
if c.compute_func is not None:
|
|
378
|
-
# create value_expr from compute_func
|
|
379
|
-
self._create_value_expr(c, self.cols_by_name)
|
|
380
|
-
if c.value_expr is not None:
|
|
381
|
-
self._record_value_expr(c)
|
|
382
|
-
|
|
383
|
-
self.cols.append(c)
|
|
384
|
-
self.cols_by_name[c.name] = c
|
|
385
|
-
self.cols_by_id[c.id] = c
|
|
386
|
-
|
|
387
|
-
# we're creating a new schema version
|
|
388
|
-
self.version += 1
|
|
389
|
-
preceding_schema_version = self.schema_version
|
|
390
|
-
self.schema_version = self.version
|
|
391
|
-
|
|
392
|
-
with Env.get().engine.begin() as conn:
|
|
393
|
-
conn.execute(
|
|
394
|
-
sql.update(store.Table.__table__)
|
|
395
|
-
.values({
|
|
396
|
-
store.Table.current_version: self.version,
|
|
397
|
-
store.Table.current_schema_version: self.schema_version,
|
|
398
|
-
store.Table.next_col_id: self.next_col_id
|
|
399
|
-
})
|
|
400
|
-
.where(store.Table.id == self.id))
|
|
401
|
-
conn.execute(
|
|
402
|
-
sql.insert(store.TableSchemaVersion.__table__)
|
|
403
|
-
.values(
|
|
404
|
-
tbl_id=self.id, schema_version=self.schema_version,
|
|
405
|
-
preceding_schema_version=preceding_schema_version))
|
|
406
|
-
conn.execute(
|
|
407
|
-
sql.insert(store.StorageColumn.__table__)
|
|
408
|
-
.values(tbl_id=self.id, col_id=c.id, schema_version_add=self.schema_version))
|
|
409
|
-
self._create_col_md(conn)
|
|
410
|
-
stmt = f'ALTER TABLE {self.storage_name()} ADD COLUMN {c.to_sql()}'
|
|
411
|
-
conn.execute(sql.text(stmt))
|
|
412
|
-
self._create_sa_tbl()
|
|
413
|
-
|
|
414
|
-
if not c.is_computed or self.count() == 0:
|
|
415
|
-
return
|
|
416
|
-
# backfill the existing rows
|
|
417
|
-
from pixeltable.dataframe import DataFrame
|
|
418
|
-
# use copy to avoid reusing existing execution state
|
|
419
|
-
query = DataFrame(self, [c.value_expr.copy()])
|
|
420
|
-
with Env.get().engine.begin() as conn:
|
|
421
|
-
with tqdm(total=self.count()) as progress_bar:
|
|
422
|
-
for result_row in query.exec(n=0, select_pk=True):
|
|
423
|
-
column_val, rowid, v_min = result_row
|
|
424
|
-
column_val = self._convert_to_stored(c, column_val, rowid)
|
|
425
|
-
conn.execute(
|
|
426
|
-
sql.update(self.sa_tbl)
|
|
427
|
-
.values({c.sa_col: column_val})
|
|
428
|
-
.where(self.rowid_col == rowid)
|
|
429
|
-
.where(self.v_min_col == v_min))
|
|
430
|
-
progress_bar.update(1)
|
|
431
|
-
|
|
432
|
-
def drop_column(self, name: str) -> None:
|
|
433
|
-
self._check_is_dropped()
|
|
434
|
-
if name not in self.cols_by_name:
|
|
435
|
-
raise exc.UnknownEntityError
|
|
436
|
-
col = self.cols_by_name[name]
|
|
437
|
-
if len(col.dependent_cols) > 0:
|
|
438
|
-
raise exc.Error(
|
|
439
|
-
f'Cannot drop column {name} because the following columns depend on it:\n',
|
|
440
|
-
f'{", ".join([c.name for c in col.dependent_cols])}')
|
|
441
|
-
if col.id == self.parameters.frame_col or col.id == self.parameters.frame_idx_col:
|
|
442
|
-
src_col_name = self.cols_by_id[self.parameters.frame_src_col].name
|
|
443
|
-
raise exc.Error(
|
|
444
|
-
f'Cannot drop column {name} because it is used for frame extraction on column {src_col_name}')
|
|
445
|
-
if col.id == self.parameters.frame_src_col:
|
|
446
|
-
# we also need to reset the frame extraction table parameters
|
|
447
|
-
self.parameters.frame_src_col = None
|
|
448
|
-
self.parameters.frame_col = None
|
|
449
|
-
self.parameters.frame_idx_col = None
|
|
450
|
-
self.parameters.extraction_fps = None
|
|
451
|
-
|
|
452
|
-
if col.value_expr is not None:
|
|
453
|
-
# update Column.dependent_cols
|
|
454
|
-
for c in self.cols:
|
|
455
|
-
if c == col:
|
|
456
|
-
break
|
|
457
|
-
try:
|
|
458
|
-
c.dependent_cols.remove(col)
|
|
459
|
-
except ValueError:
|
|
460
|
-
# ignore
|
|
461
|
-
pass
|
|
462
|
-
|
|
463
|
-
self.cols.remove(col)
|
|
464
|
-
del self.cols_by_name[name]
|
|
465
|
-
del self.cols_by_id[col.id]
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
# we're creating a new schema version
|
|
469
|
-
self.version += 1
|
|
470
|
-
preceding_schema_version = self.schema_version
|
|
471
|
-
self.schema_version = self.version
|
|
472
|
-
|
|
473
|
-
with Env.get().engine.begin() as conn:
|
|
474
|
-
conn.execute(
|
|
475
|
-
sql.update(store.Table.__table__)
|
|
476
|
-
.values({
|
|
477
|
-
store.Table.parameters: dataclasses.asdict(self.parameters),
|
|
478
|
-
store.Table.current_version: self.version,
|
|
479
|
-
store.Table.current_schema_version: self.schema_version
|
|
480
|
-
})
|
|
481
|
-
.where(store.Table.id == self.id))
|
|
482
|
-
conn.execute(
|
|
483
|
-
sql.insert(store.TableSchemaVersion.__table__)
|
|
484
|
-
.values(
|
|
485
|
-
tbl_id=self.id, schema_version=self.schema_version,
|
|
486
|
-
preceding_schema_version=preceding_schema_version))
|
|
487
|
-
conn.execute(
|
|
488
|
-
sql.update(store.StorageColumn.__table__)
|
|
489
|
-
.values({store.StorageColumn.schema_version_drop: self.schema_version})
|
|
490
|
-
.where(store.StorageColumn.tbl_id == self.id)
|
|
491
|
-
.where(store.StorageColumn.col_id == col.id))
|
|
492
|
-
self._create_col_md(conn)
|
|
493
|
-
self._create_sa_tbl()
|
|
494
|
-
|
|
495
|
-
def rename_column(self, old_name: str, new_name: str) -> None:
|
|
496
|
-
self._check_is_dropped()
|
|
497
|
-
if old_name not in self.cols_by_name:
|
|
498
|
-
raise exc.UnknownEntityError(f'Unknown column: {old_name}')
|
|
499
|
-
if re.fullmatch(_ID_RE, new_name) is None:
|
|
500
|
-
raise exc.BadFormatError(f"Invalid column name: '{new_name}'")
|
|
501
|
-
if new_name in self.cols_by_name:
|
|
502
|
-
raise exc.DuplicateNameError(f'Column {new_name} already exists')
|
|
503
|
-
col = self.cols_by_name[old_name]
|
|
504
|
-
del self.cols_by_name[old_name]
|
|
505
|
-
col.name = new_name
|
|
506
|
-
self.cols_by_name[new_name] = col
|
|
507
|
-
|
|
508
|
-
# we're creating a new schema version
|
|
509
|
-
self.version += 1
|
|
510
|
-
preceding_schema_version = self.schema_version
|
|
511
|
-
self.schema_version = self.version
|
|
512
|
-
|
|
513
|
-
with Env.get().engine.begin() as conn:
|
|
514
|
-
conn.execute(
|
|
515
|
-
sql.update(store.Table.__table__)
|
|
516
|
-
.values({
|
|
517
|
-
store.Table.current_version: self.version,
|
|
518
|
-
store.Table.current_schema_version: self.schema_version
|
|
519
|
-
})
|
|
520
|
-
.where(store.Table.id == self.id))
|
|
521
|
-
conn.execute(
|
|
522
|
-
sql.insert(store.TableSchemaVersion.__table__)
|
|
523
|
-
.values(tbl_id=self.id, schema_version=self.schema_version,
|
|
524
|
-
preceding_schema_version=preceding_schema_version))
|
|
525
|
-
self._create_col_md(conn)
|
|
526
|
-
|
|
527
|
-
def _create_col_md(self, conn: sql.engine.base.Connection) -> None:
|
|
528
|
-
for pos, c in enumerate(self.cols):
|
|
529
|
-
value_expr_str = c.value_expr.serialize() if c.value_expr is not None else None
|
|
530
|
-
conn.execute(
|
|
531
|
-
sql.insert(store.SchemaColumn.__table__)
|
|
532
|
-
.values(
|
|
533
|
-
tbl_id=self.id, schema_version=self.version, col_id=c.id, pos=pos, name=c.name,
|
|
534
|
-
col_type=c.col_type.serialize(), is_nullable=c.nullable, is_pk=c.primary_key,
|
|
535
|
-
value_expr=value_expr_str, is_indexed=c.is_indexed))
|
|
536
|
-
|
|
537
|
-
def _convert_to_stored(self, col: Column, val: Any, rowid: int) -> Any:
|
|
538
|
-
"""
|
|
539
|
-
Convert column value 'val' into a store-compatible format, if needed:
|
|
540
|
-
- images are stored as files
|
|
541
|
-
- arrays are stored as serialized ndarrays
|
|
542
|
-
"""
|
|
543
|
-
if col.col_type.is_image_type():
|
|
544
|
-
# replace PIL.Image.Image with file path
|
|
545
|
-
img = val
|
|
546
|
-
img_path = utils.get_computed_img_path(self.id, col.id, self.version, rowid)
|
|
547
|
-
img.save(img_path)
|
|
548
|
-
return str(img_path)
|
|
549
|
-
elif col.col_type.is_array_type():
|
|
550
|
-
# serialize numpy array
|
|
551
|
-
np_array = val
|
|
552
|
-
buffer = io.BytesIO()
|
|
553
|
-
np.save(buffer, np_array)
|
|
554
|
-
return buffer.getvalue()
|
|
555
|
-
else:
|
|
556
|
-
return val
|
|
557
|
-
|
|
558
|
-
def insert_rows(self, rows: List[List[Any]], columns: List[str] = []) -> None:
|
|
559
|
-
"""
|
|
560
|
-
Insert rows into table. 'Columns' is a list of column names that specify the columns present in 'rows'.
|
|
561
|
-
'Columns' == empty: all columns are present in 'rows'.
|
|
562
|
-
"""
|
|
563
|
-
assert len(rows) > 0
|
|
564
|
-
if len(rows[0]) != len(self.cols) and len(columns) == 0:
|
|
565
|
-
raise exc.Error(
|
|
566
|
-
f'Table {self.name} has {len(self.cols)} columns, but the data only contains {len(rows[0])} columns. '
|
|
567
|
-
f"In this case, you need to specify the column names with the 'columns' parameter.")
|
|
568
|
-
|
|
569
|
-
# make sure that each row contains the same number of values
|
|
570
|
-
num_col_vals = len(rows[0])
|
|
571
|
-
for i in range(1, len(rows)):
|
|
572
|
-
if len(rows[i]) != num_col_vals:
|
|
573
|
-
raise exc.Error(
|
|
574
|
-
f'Inconsistent number of column values in rows: row 0 has {len(rows[0])}, '
|
|
575
|
-
f'row {i} has {len(rows[i])}')
|
|
576
|
-
|
|
577
|
-
if len(columns) == 0:
|
|
578
|
-
columns = [c.name for c in self.cols]
|
|
579
|
-
if len(rows[0]) != len(columns):
|
|
580
|
-
raise exc.Error(
|
|
581
|
-
f'The number of column values in rows ({len(rows[0])}) does not match the given number of column names '
|
|
582
|
-
f'({len(columns)}')
|
|
583
|
-
|
|
584
|
-
pd_df = pd.DataFrame.from_records(rows, columns=columns)
|
|
585
|
-
self.insert_pandas(pd_df)
|
|
586
|
-
|
|
587
|
-
def insert_pandas(self, data: pd.DataFrame) -> None:
|
|
588
|
-
"""
|
|
589
|
-
If self.parameters.frame_src_col != None:
|
|
590
|
-
- each row (containing a video) is expanded into one row per extracted frame (at the rate of the fps parameter)
|
|
591
|
-
- parameters.frame_col is the image column that receives the extracted frame
|
|
592
|
-
- parameters.frame_idx_col is the integer column that receives the frame index (starting at 0)
|
|
593
|
-
"""
|
|
594
|
-
self._check_is_dropped()
|
|
595
|
-
all_col_names = {col.name for col in self.cols}
|
|
596
|
-
reqd_col_names = {col.name for col in self.cols if not col.nullable and col.value_expr is None}
|
|
597
|
-
if self.parameters.frame_src_col is not None:
|
|
598
|
-
reqd_col_names.discard(self.cols_by_id[self.parameters.frame_col].name)
|
|
599
|
-
reqd_col_names.discard(self.cols_by_id[self.parameters.frame_idx_col].name)
|
|
600
|
-
given_col_names = set(data.columns)
|
|
601
|
-
if not(reqd_col_names <= given_col_names):
|
|
602
|
-
raise exc.InsertError(f'Missing columns: {", ".join(reqd_col_names - given_col_names)}')
|
|
603
|
-
if not(given_col_names <= all_col_names):
|
|
604
|
-
raise exc.InsertError(f'Unknown columns: {", ".join(given_col_names - all_col_names)}')
|
|
605
|
-
computed_col_names = {col.name for col in self.cols if col.value_expr is not None}
|
|
606
|
-
if self.parameters.frame_src_col is not None:
|
|
607
|
-
computed_col_names.add(self.cols_by_id[self.parameters.frame_col].name)
|
|
608
|
-
computed_col_names.add(self.cols_by_id[self.parameters.frame_idx_col].name)
|
|
609
|
-
if len(computed_col_names & given_col_names) > 0:
|
|
610
|
-
raise exc.InsertError(
|
|
611
|
-
f'Provided values for computed columns: {", ".join(computed_col_names & given_col_names)}')
|
|
612
|
-
|
|
613
|
-
# check types
|
|
614
|
-
provided_cols = [self.cols_by_name[name] for name in data.columns]
|
|
615
|
-
for col in provided_cols:
|
|
616
|
-
if col.col_type.is_string_type() and not pd.api.types.is_string_dtype(data.dtypes[col.name]):
|
|
617
|
-
raise exc.InsertError(f'Column {col.name} requires string data but contains {data.dtypes[col.name]}')
|
|
618
|
-
if col.col_type.is_int_type() and not pd.api.types.is_integer_dtype(data.dtypes[col.name]):
|
|
619
|
-
raise exc.InsertError(f'Column {col.name} requires integer data but contains {data.dtypes[col.name]}')
|
|
620
|
-
if col.col_type.is_float_type() and not pd.api.types.is_numeric_dtype(data.dtypes[col.name]):
|
|
621
|
-
raise exc.InsertError(f'Column {col.name} requires numerical data but contains {data.dtypes[col.name]}')
|
|
622
|
-
if col.col_type.is_bool_type() and not pd.api.types.is_bool_dtype(data.dtypes[col.name]):
|
|
623
|
-
raise exc.InsertError(f'Column {col.name} requires boolean data but contains {data.dtypes[col.name]}')
|
|
624
|
-
if col.col_type.is_timestamp_type() and not pd.api.types.is_datetime64_any_dtype(data.dtypes[col.name]):
|
|
625
|
-
raise exc.InsertError(f'Column {col.name} requires datetime data but contains {data.dtypes[col.name]}')
|
|
626
|
-
if col.col_type.is_json_type() and not pd.api.types.is_object_dtype(data.dtypes[col.name]):
|
|
627
|
-
raise exc.InsertError(
|
|
628
|
-
f'Column {col.name} requires dictionary data but contains {data.dtypes[col.name]}')
|
|
629
|
-
if col.col_type.is_array_type() and not pd.api.types.is_object_dtype(data.dtypes[col.name]):
|
|
630
|
-
raise exc.InsertError(
|
|
631
|
-
f'Column {col.name} requires array data but contains {data.dtypes[col.name]}')
|
|
632
|
-
if col.col_type.is_image_type() and not pd.api.types.is_string_dtype(data.dtypes[col.name]):
|
|
633
|
-
raise exc.InsertError(
|
|
634
|
-
f'Column {col.name} requires local file paths but contains {data.dtypes[col.name]}')
|
|
635
|
-
if col.col_type.is_video_type() and not pd.api.types.is_string_dtype(data.dtypes[col.name]):
|
|
636
|
-
raise exc.InsertError(
|
|
637
|
-
f'Column {col.name} requires local file paths but contains {data.dtypes[col.name]}')
|
|
638
|
-
|
|
639
|
-
# check data
|
|
640
|
-
data_cols = [self.cols_by_name[name] for name in data.columns]
|
|
641
|
-
for col in data_cols:
|
|
642
|
-
# image cols: make sure file path points to a valid image file
|
|
643
|
-
if col.col_type.is_image_type():
|
|
644
|
-
for _, path_str in data[col.name].items():
|
|
645
|
-
try:
|
|
646
|
-
_ = Image.open(path_str)
|
|
647
|
-
except FileNotFoundError:
|
|
648
|
-
raise exc.OperationalError(f'Column {col.name}: file does not exist: {path_str}')
|
|
649
|
-
except PIL.UnidentifiedImageError:
|
|
650
|
-
raise exc.OperationalError(f'Column {col.name}: not a valid image file: {path_str}')
|
|
651
|
-
|
|
652
|
-
# image cols: make sure file path points to a valid image file; build index if col is indexed
|
|
653
|
-
if col.col_type.is_video_type():
|
|
654
|
-
for _, path_str in data[col.name].items():
|
|
655
|
-
cap = cv2.VideoCapture(path_str)
|
|
656
|
-
success = cap.isOpened()
|
|
657
|
-
cap.release()
|
|
658
|
-
if not success:
|
|
659
|
-
raise exc.Error(f'Column {col.name}: could not open video file {path_str}')
|
|
660
|
-
|
|
661
|
-
if col.col_type.is_json_type():
|
|
662
|
-
for idx, d in data[col.name].items():
|
|
663
|
-
if not isinstance(d, dict) and not isinstance(d, list):
|
|
664
|
-
raise exc.OperationalError(
|
|
665
|
-
f'Value for column {col.name} in row {idx} requires a dictionary or list: {d} ')
|
|
666
|
-
|
|
667
|
-
# we're creating a new version
|
|
668
|
-
self.version += 1
|
|
669
|
-
|
|
670
|
-
# frame extraction from videos
|
|
671
|
-
if self.parameters.frame_src_col is not None:
|
|
672
|
-
video_col = self.cols_by_id[self.parameters.frame_src_col]
|
|
673
|
-
frame_col = self.cols_by_id[self.parameters.frame_col]
|
|
674
|
-
frame_idx_col = self.cols_by_id[self.parameters.frame_idx_col]
|
|
675
|
-
|
|
676
|
-
# check data: video_column needs to contain valid file paths
|
|
677
|
-
for idx, path_str in data[video_col.name].items():
|
|
678
|
-
path = pathlib.Path(path_str)
|
|
679
|
-
if not path.is_file():
|
|
680
|
-
raise exc.OperationalError(
|
|
681
|
-
f'For frame extraction, value for column {col.name} in row {idx} requires a valid '
|
|
682
|
-
f'file path: {path}')
|
|
683
|
-
|
|
684
|
-
# expand each row in 'data' into one row per frame, adding columns frame_column and frame_idx_column
|
|
685
|
-
expanded_rows: List[Dict] = []
|
|
686
|
-
for input_row_idx, input_tuple in enumerate(data.itertuples(index=False)):
|
|
687
|
-
input_row = input_tuple._asdict()
|
|
688
|
-
path = input_row[video_col.name]
|
|
689
|
-
# we need to generate a unique prefix for each set of frames corresponding to a single video
|
|
690
|
-
frame_path_prefix = utils.get_extracted_frame_path(
|
|
691
|
-
self.id, video_col.id, self.version, self.next_row_id + input_row_idx)
|
|
692
|
-
frame_paths = video.extract_frames(path, frame_path_prefix, self.parameters.extraction_fps)
|
|
693
|
-
frame_rows = [
|
|
694
|
-
{frame_col.name: p, frame_idx_col.name: i, **input_row} for i, p in enumerate(frame_paths)
|
|
695
|
-
]
|
|
696
|
-
expanded_rows.extend(frame_rows)
|
|
697
|
-
data = pd.DataFrame.from_dict(expanded_rows, orient='columns')
|
|
698
|
-
|
|
699
|
-
rowids = range(self.next_row_id, self.next_row_id + len(data))
|
|
700
|
-
|
|
701
|
-
# update image indices
|
|
702
|
-
data_cols = [self.cols_by_name[name] for name in data.columns]
|
|
703
|
-
for col in [c for c in data_cols if c.is_indexed]:
|
|
704
|
-
embeddings = np.zeros((len(data), 512))
|
|
705
|
-
for i, (_, path_str) in enumerate(data[col.name].items()):
|
|
706
|
-
try:
|
|
707
|
-
img = Image.open(path_str)
|
|
708
|
-
embeddings[i] = clip.encode_image(img)
|
|
709
|
-
except FileNotFoundError:
|
|
710
|
-
raise exc.OperationalError(f'Column {col.name}: file does not exist: {path_str}')
|
|
711
|
-
except PIL.UnidentifiedImageError:
|
|
712
|
-
raise exc.OperationalError(f'Column {col.name}: not a valid image file: {path_str}')
|
|
713
|
-
assert col.idx is not None
|
|
714
|
-
col.idx.insert(embeddings, np.array(rowids))
|
|
715
|
-
|
|
716
|
-
# prepare state for computed cols
|
|
717
|
-
from pixeltable import exprs
|
|
718
|
-
eval_ctx: Optional[exprs.ComputedColEvalCtx] = None
|
|
719
|
-
evaluator: Optional[exprs.ExprEvaluator] = None
|
|
720
|
-
input_col_refs: List[exprs.ColumnRef] = [] # columns needed as input for computing value_exprs
|
|
721
|
-
computed_cols = [col for col in self.cols if col.value_expr is not None]
|
|
722
|
-
value_exprs: List[exprs.Expr] = [] # for computed_cols
|
|
723
|
-
window_sort_exprs: List[exprs.Expr] = []
|
|
724
|
-
if len(computed_cols) > 0:
|
|
725
|
-
# create copies to avoid reusing past execution state; eval ctx and evaluator need to share these copies
|
|
726
|
-
value_exprs = [c.value_expr.copy() for c in computed_cols]
|
|
727
|
-
eval_ctx = exprs.ComputedColEvalCtx(
|
|
728
|
-
[(exprs.ColumnRef(computed_cols[i]), value_exprs[i]) for i in range(len(computed_cols))])
|
|
729
|
-
evaluator = exprs.ExprEvaluator(value_exprs, None, with_sql=False)
|
|
730
|
-
input_col_refs = [
|
|
731
|
-
e for e in evaluator.output_eval_exprs
|
|
732
|
-
# we're looking for ColumnRefs to Columns that aren't themselves computed
|
|
733
|
-
if isinstance(e, exprs.ColumnRef) and e.col.value_expr is None
|
|
734
|
-
]
|
|
735
|
-
|
|
736
|
-
# determine order_by clause for window functions, if any
|
|
737
|
-
window_fn_calls = [
|
|
738
|
-
e for e in exprs.Expr.list_subexprs(value_exprs)
|
|
739
|
-
if isinstance(e, exprs.FunctionCall) and e.is_window_fn_call
|
|
740
|
-
]
|
|
741
|
-
window_sort_exprs = window_fn_calls[0].get_window_sort_exprs() if len(window_fn_calls) > 0 else []
|
|
742
|
-
|
|
743
|
-
# construct new df with the storage column names, in order to iterate over it more easily
|
|
744
|
-
stored_data = {col.storage_name(): data[col.name] for col in data_cols}
|
|
745
|
-
stored_data_df = pd.DataFrame(data=stored_data)
|
|
746
|
-
if len(window_sort_exprs) > 0:
|
|
747
|
-
# need to sort data in order to compute windowed agg functions
|
|
748
|
-
storage_col_names = [e.col.storage_name() for e in window_sort_exprs]
|
|
749
|
-
stored_data_df.sort_values(storage_col_names, axis=0, inplace=True)
|
|
750
|
-
insert_values: List[Dict[str, Any]] = []
|
|
751
|
-
with tqdm(total=len(stored_data_df)) as progress_bar:
|
|
752
|
-
for row_idx, row in enumerate(stored_data_df.itertuples(index=False)):
|
|
753
|
-
row_dict = {'rowid': rowids[row_idx], 'v_min': self.version, **row._asdict()}
|
|
754
|
-
|
|
755
|
-
if len(computed_cols) > 0:
|
|
756
|
-
# materialize computed column values
|
|
757
|
-
data_row = [None] * eval_ctx.num_materialized
|
|
758
|
-
# copy inputs
|
|
759
|
-
for col_ref in input_col_refs:
|
|
760
|
-
data_row[col_ref.data_row_idx] = row_dict[col_ref.col.storage_name()]
|
|
761
|
-
# load image, if this is a file path
|
|
762
|
-
if col_ref.col_type.is_image_type():
|
|
763
|
-
data_row[col_ref.data_row_idx] = PIL.Image.open(data_row[col_ref.data_row_idx])
|
|
764
|
-
evaluator.eval((), data_row)
|
|
765
|
-
|
|
766
|
-
# convert data values to storage format where necessary
|
|
767
|
-
for col_idx in range(len(computed_cols)):
|
|
768
|
-
val = data_row[value_exprs[col_idx].data_row_idx]
|
|
769
|
-
data_row[value_exprs[col_idx].data_row_idx] = \
|
|
770
|
-
self._convert_to_stored(computed_cols[col_idx], val, rowids[row_idx])
|
|
771
|
-
|
|
772
|
-
computed_vals_dict = {
|
|
773
|
-
computed_cols[i].storage_name(): data_row[value_exprs[i].data_row_idx]
|
|
774
|
-
for i in range(len(computed_cols))
|
|
775
|
-
}
|
|
776
|
-
row_dict.update(computed_vals_dict)
|
|
777
|
-
|
|
778
|
-
insert_values.append(row_dict)
|
|
779
|
-
progress_bar.update(1)
|
|
780
|
-
|
|
781
|
-
with Env.get().engine.begin() as conn:
|
|
782
|
-
conn.execute(sql.insert(self.sa_tbl), insert_values)
|
|
783
|
-
self.next_row_id += len(data)
|
|
784
|
-
conn.execute(
|
|
785
|
-
sql.update(store.Table.__table__)
|
|
786
|
-
.values({store.Table.current_version: self.version, store.Table.next_row_id: self.next_row_id})
|
|
787
|
-
.where(store.Table.id == self.id))
|
|
788
|
-
|
|
789
|
-
self.valid_rowids.update(rowids)
|
|
790
|
-
|
|
791
|
-
def insert_csv(self, file_path: str) -> None:
|
|
792
|
-
pass
|
|
793
|
-
|
|
794
|
-
# TODO: update() signature?
|
|
795
|
-
#def update(self, data: pd.DataFrame) -> None:
|
|
796
|
-
|
|
797
|
-
# TODO: delete() signature?
|
|
798
|
-
#def delete(self, data: DataFrame) -> None:
|
|
799
|
-
|
|
800
|
-
def _delete_computed_imgs(self, version: int) -> None:
|
|
801
|
-
"""
|
|
802
|
-
Delete image files computed for given version.
|
|
803
|
-
"""
|
|
804
|
-
img_paths = utils.computed_imgs(tbl_id=self.id, version=version)
|
|
805
|
-
for p in img_paths:
|
|
806
|
-
os.remove(p)
|
|
807
|
-
return
|
|
808
|
-
|
|
809
|
-
def _delete_extracted_frames(self, version: int) -> None:
|
|
810
|
-
"""
|
|
811
|
-
Delete extracted frames for given version.
|
|
812
|
-
"""
|
|
813
|
-
frame_paths = utils.extracted_frames(tbl_id=self.id, version=version)
|
|
814
|
-
for p in frame_paths:
|
|
815
|
-
os.remove(p)
|
|
816
|
-
|
|
817
|
-
def revert(self) -> None:
|
|
818
|
-
self._check_is_dropped()
|
|
819
|
-
if self.version == 0:
|
|
820
|
-
raise exc.OperationalError('Cannot revert version 0')
|
|
821
|
-
# check if the current version is referenced by a snapshot
|
|
822
|
-
with orm.Session(Env.get().engine) as session:
|
|
823
|
-
# make sure we don't have a snapshot referencing this version
|
|
824
|
-
num_references = session.query(sql.func.count(store.TableSnapshot.id)) \
|
|
825
|
-
.where(store.TableSnapshot.db_id == self.db_id) \
|
|
826
|
-
.where(store.TableSnapshot.tbl_id == self.id) \
|
|
827
|
-
.where(store.TableSnapshot.tbl_version == self.version) \
|
|
828
|
-
.scalar()
|
|
829
|
-
if num_references > 0:
|
|
830
|
-
raise exc.OperationalError(
|
|
831
|
-
f'Current version is needed for {num_references} snapshot{"s" if num_references > 1 else ""}')
|
|
832
|
-
|
|
833
|
-
conn = session.connection()
|
|
834
|
-
# delete newly-added data
|
|
835
|
-
self._delete_computed_imgs(self.version)
|
|
836
|
-
self._delete_extracted_frames(self.version)
|
|
837
|
-
conn.execute(sql.delete(self.sa_tbl).where(self.sa_tbl.c.v_min == self.version))
|
|
838
|
-
# revert new deletions
|
|
839
|
-
conn.execute(
|
|
840
|
-
sql.update(self.sa_tbl).values({self.sa_tbl.c.v_max: store.Table.MAX_VERSION})
|
|
841
|
-
.where(self.sa_tbl.c.v_max == self.version))
|
|
842
|
-
|
|
843
|
-
if self.version == self.schema_version:
|
|
844
|
-
# the current version involved a schema change:
|
|
845
|
-
# we need to determine the preceding schema version and reload the schema
|
|
846
|
-
preceding_schema_version = session.query(store.TableSchemaVersion.preceding_schema_version) \
|
|
847
|
-
.where(store.TableSchemaVersion.tbl_id == self.id) \
|
|
848
|
-
.where(store.TableSchemaVersion.schema_version == self.schema_version) \
|
|
849
|
-
.scalar()
|
|
850
|
-
self.cols = self.load_cols(self.id, preceding_schema_version, session)
|
|
851
|
-
conn.execute(
|
|
852
|
-
sql.delete(store.TableSchemaVersion.__table__)
|
|
853
|
-
.where(store.TableSchemaVersion.tbl_id == self.id)
|
|
854
|
-
.where(store.TableSchemaVersion.schema_version == self.schema_version))
|
|
855
|
-
self.schema_version = preceding_schema_version
|
|
856
|
-
|
|
857
|
-
conn.execute(
|
|
858
|
-
sql.update(store.Table.__table__)
|
|
859
|
-
.values({
|
|
860
|
-
store.Table.current_version: self.version,
|
|
861
|
-
store.Table.current_schema_version: self.schema_version
|
|
862
|
-
})
|
|
863
|
-
.where(store.Table.id == self.id))
|
|
864
|
-
|
|
865
|
-
session.commit()
|
|
866
|
-
self.version -= 1
|
|
867
|
-
|
|
868
|
-
# MODULE-LOCAL, NOT PUBLIC
|
|
869
|
-
def rename(self, new_name: str) -> None:
|
|
870
|
-
self._check_is_dropped()
|
|
871
|
-
with Env.get().engine.begin() as conn:
|
|
872
|
-
conn.execute(
|
|
873
|
-
sql.update(store.Table.__table__).values({store.Table.name: new_name})
|
|
874
|
-
.where(store.Table.id == self.id))
|
|
875
|
-
|
|
876
|
-
# MODULE-LOCAL, NOT PUBLIC
|
|
877
|
-
def drop(self) -> None:
|
|
878
|
-
self._check_is_dropped()
|
|
879
|
-
with Env.get().engine.begin() as conn:
|
|
880
|
-
conn.execute(
|
|
881
|
-
sql.update(store.Table.__table__).values({store.Table.is_mutable: False})
|
|
882
|
-
.where(store.Table.id == self.id))
|
|
883
|
-
|
|
884
|
-
@classmethod
|
|
885
|
-
def _create_value_expr(cls, col: Column, existing_cols: Dict[str, Column]) -> None:
|
|
886
|
-
"""
|
|
887
|
-
Create col.value_expr, given col.compute_func.
|
|
888
|
-
Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
|
|
889
|
-
Does not update Column.dependent_cols.
|
|
890
|
-
"""
|
|
891
|
-
assert col.value_expr is None
|
|
892
|
-
assert col.compute_func is not None
|
|
893
|
-
from pixeltable import exprs
|
|
894
|
-
params = inspect.signature(col.compute_func).parameters
|
|
895
|
-
args: List[exprs.ColumnRef] = []
|
|
896
|
-
for param_name in params:
|
|
897
|
-
if param_name not in existing_cols:
|
|
898
|
-
raise exc.Error(
|
|
899
|
-
f'Column {col.name}: compute_with parameter refers to an unknown column: {param_name}')
|
|
900
|
-
args.append(exprs.ColumnRef(existing_cols[param_name]))
|
|
901
|
-
fn = Function(col.col_type, [arg.col_type for arg in args], eval_fn=col.compute_func)
|
|
902
|
-
col.value_expr = exprs.FunctionCall(fn, args)
|
|
903
|
-
|
|
904
|
-
# MODULE-LOCAL, NOT PUBLIC
|
|
905
|
-
@classmethod
|
|
906
|
-
def create(
|
|
907
|
-
cls, db_id: int, dir_id: int, name: str, cols: List[Column],
|
|
908
|
-
num_retained_versions: int,
|
|
909
|
-
extract_frames_from: Optional[str], extracted_frame_col: Optional[str], extracted_frame_idx_col: Optional[str],
|
|
910
|
-
extracted_fps: Optional[int]
|
|
911
|
-
) -> 'MutableTable':
|
|
912
|
-
# make sure col names are unique (within the table) and assign ids
|
|
913
|
-
cols_by_name: Dict[str, Column] = {}
|
|
914
|
-
for pos, c in enumerate(cols):
|
|
915
|
-
if c.name in cols_by_name:
|
|
916
|
-
raise exc.DuplicateNameError(f'Duplicate column: {c.name}')
|
|
917
|
-
c.id = pos
|
|
918
|
-
cols_by_name[c.name] = c
|
|
919
|
-
|
|
920
|
-
# check frame extraction params, if present
|
|
921
|
-
if extract_frames_from is not None:
|
|
922
|
-
assert extracted_frame_col is not None and extracted_frame_idx_col is not None and extracted_fps is not None
|
|
923
|
-
if extract_frames_from is not None and extract_frames_from not in cols_by_name:
|
|
924
|
-
raise exc.BadFormatError(f'Unknown column in extract_frames_from: {extract_frames_from}')
|
|
925
|
-
col_type = cols_by_name[extract_frames_from].col_type
|
|
926
|
-
is_nullable = cols_by_name[extract_frames_from].nullable
|
|
927
|
-
if not col_type.is_video_type():
|
|
928
|
-
raise exc.BadFormatError(
|
|
929
|
-
f'extract_frames_from requires the name of a column of type video, but {extract_frames_from} has '
|
|
930
|
-
f'type {col_type}')
|
|
931
|
-
if extracted_frame_col is not None and extracted_frame_col not in cols_by_name:
|
|
932
|
-
raise exc.BadFormatError(f'Unknown column in extracted_frame_col: {extracted_frame_col}')
|
|
933
|
-
col_type = cols_by_name[extracted_frame_col].col_type
|
|
934
|
-
if not col_type.is_image_type():
|
|
935
|
-
raise exc.BadFormatError(
|
|
936
|
-
f'extracted_frame_col requires the name of a column of type image, but {extracted_frame_col} has '
|
|
937
|
-
f'type {col_type}')
|
|
938
|
-
# the src column determines whether the frame column is nullable
|
|
939
|
-
cols_by_name[extracted_frame_col].nullable = is_nullable
|
|
940
|
-
if extracted_frame_idx_col is not None and extracted_frame_idx_col not in cols_by_name:
|
|
941
|
-
raise exc.BadFormatError(f'Unknown column in extracted_frame_idx_col: {extracted_frame_idx_col}')
|
|
942
|
-
col_type = cols_by_name[extracted_frame_idx_col].col_type
|
|
943
|
-
if not col_type.is_int_type():
|
|
944
|
-
raise exc.BadFormatError(
|
|
945
|
-
f'extracted_frame_idx_col requires the name of a column of type int, but {extracted_frame_idx_col} '
|
|
946
|
-
f'has type {col_type}')
|
|
947
|
-
# the src column determines whether the frame idx column is nullable
|
|
948
|
-
cols_by_name[extracted_frame_idx_col].nullable = is_nullable
|
|
949
|
-
|
|
950
|
-
params = TableParameters(
|
|
951
|
-
num_retained_versions,
|
|
952
|
-
cols_by_name[extract_frames_from].id if extract_frames_from is not None else None,
|
|
953
|
-
cols_by_name[extracted_frame_col].id if extracted_frame_col is not None else None,
|
|
954
|
-
cols_by_name[extracted_frame_idx_col].id if extracted_frame_idx_col is not None else None,
|
|
955
|
-
extracted_fps)
|
|
956
|
-
|
|
957
|
-
with orm.Session(Env.get().engine) as session:
|
|
958
|
-
tbl_record = store.Table(
|
|
959
|
-
db_id=db_id, dir_id=dir_id, name=name, parameters=dataclasses.asdict(params), current_version=0,
|
|
960
|
-
current_schema_version=0, is_mutable=True, next_col_id=len(cols), next_row_id=0)
|
|
961
|
-
session.add(tbl_record)
|
|
962
|
-
session.flush() # sets tbl_record.id
|
|
963
|
-
|
|
964
|
-
tbl_version_record = store.TableSchemaVersion(
|
|
965
|
-
tbl_id=tbl_record.id, schema_version=0, preceding_schema_version=0)
|
|
966
|
-
session.add(tbl_version_record)
|
|
967
|
-
session.flush() # avoid FK violations in Postgres
|
|
968
|
-
print(f'creating table {name}, id={tbl_record.id}')
|
|
969
|
-
|
|
970
|
-
cols_by_name: Dict[str, Column] = {} # records the cols we have seen so far
|
|
971
|
-
for pos, col in enumerate(cols):
|
|
972
|
-
session.add(store.StorageColumn(tbl_id=tbl_record.id, col_id=col.id, schema_version_add=0))
|
|
973
|
-
session.flush() # avoid FK violations in Postgres
|
|
974
|
-
if col.value_expr is None and col.compute_func is not None:
|
|
975
|
-
cls._create_value_expr(col, cols_by_name)
|
|
976
|
-
# Column.dependent_cols for existing cols is wrong at this point, but Table.init() will set it correctly
|
|
977
|
-
value_expr_str = col.value_expr.serialize() if col.value_expr is not None else None
|
|
978
|
-
session.add(
|
|
979
|
-
store.SchemaColumn(
|
|
980
|
-
tbl_id=tbl_record.id, schema_version=0, col_id=col.id, pos=pos, name=col.name,
|
|
981
|
-
col_type=col.col_type.serialize(), is_nullable=col.nullable, is_pk=col.primary_key,
|
|
982
|
-
value_expr=value_expr_str, is_indexed=col.is_indexed
|
|
983
|
-
)
|
|
984
|
-
)
|
|
985
|
-
session.flush() # avoid FK violations in Postgres
|
|
986
|
-
|
|
987
|
-
# for image cols, add VectorIndex for kNN search
|
|
988
|
-
if col.is_indexed and col.col_type.is_image_type():
|
|
989
|
-
col.set_idx(VectorIndex.create(Table._vector_idx_name(tbl_record.id, col), 512))
|
|
990
|
-
|
|
991
|
-
cols_by_name[col.name] = col
|
|
992
|
-
session.flush()
|
|
993
|
-
|
|
994
|
-
assert tbl_record.id is not None
|
|
995
|
-
tbl = MutableTable(tbl_record, 0, cols)
|
|
996
|
-
tbl.sa_md.create_all(bind=session.connection())
|
|
997
|
-
session.commit()
|
|
998
|
-
return tbl
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
-class Path:
-    def __init__(self, path: str, empty_is_valid: bool = False):
-        if path == '' and not empty_is_valid or path != '' and re.fullmatch(_PATH_RE, path) is None:
-            raise exc.BadFormatError(f"Invalid path format: '{path}'")
-        self.components = path.split('.')
-
-    @property
-    def len(self) -> int:
-        return 0 if self.is_root else len(self.components)
-
-    @property
-    def name(self) -> str:
-        assert len(self.components) > 0
-        return self.components[-1]
-
-    @property
-    def is_root(self) -> bool:
-        return self.components[0] == ''
-
-    @property
-    def parent(self) -> 'Path':
-        if len(self.components) == 1:
-            if self.is_root:
-                return self
-            else:
-                return Path('', empty_is_valid=True)
-        else:
-            return Path('.'.join(self.components[:-1]))
-
-    def append(self, name: str) -> 'Path':
-        if self.is_root:
-            return Path(name)
-        else:
-            return Path(f'{str(self)}.{name}')
-
-    def is_ancestor(self, other: 'Path', is_parent: bool = False) -> bool:
-        """
-        True if self is an ancestor path of other.
-        """
-        if self.len >= other.len or other.is_root:
-            return False
-        if self.is_root and (other.len == 1 or not is_parent):
-            return True
-        is_prefix = self.components == other.components[:self.len]
-        return is_prefix and (self.len == (other.len - 1) or not is_parent)
-
-    def __str__(self) -> str:
-        return '.'.join(self.components)
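For reference, the path algebra the deleted `Path` class implements, with `is_parent` tightening `is_ancestor` to direct parenthood. A sketch assuming the class above (with its `_PATH_RE` and `exc` dependencies) is in scope; the root path is the empty string:

```python
p = Path('dir1.sub.tbl')
assert p.name == 'tbl' and p.len == 3
assert str(p.parent) == 'dir1.sub'

root = Path('', empty_is_valid=True)
assert root.is_root and root.len == 0
assert root.is_ancestor(p)                               # root precedes everything
assert Path('dir1').is_ancestor(p)                       # proper prefix
assert not Path('dir1').is_ancestor(p, is_parent=True)   # grandparent, not parent
assert Path('dir1.sub').is_ancestor(p, is_parent=True)   # direct parent
```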
-class PathDict:
-    def __init__(self) -> None:
-        # *not* Dict[Path, SchemaObject]
-        self.paths: Dict[str, SchemaObject] = {}  # all paths
-
-    def __getitem__(self, path: Path) -> SchemaObject:
-        return self.paths[str(path)]
-
-    def __setitem__(self, path: Path, val: SchemaObject) -> None:
-        self.paths[str(path)] = val
-
-    def __delitem__(self, path: Path) -> None:
-        del self.paths[str(path)]
-
-    def update(self, paths: Dict[str, SchemaObject]) -> None:
-        self.paths.update(paths)
-
-    # checks that the parent of path exists and is a Dir,
-    # and that the object at path has the 'expected' type
-    def check_is_valid(
-            self, path: Path, expected: Optional[Type[SchemaObject]],
-            expected_parent_type: Type[DirBase] = DirBase) -> None:
-        path_str = str(path)
-        # check for existence
-        if expected is not None:
-            if path_str not in self.paths:
-                raise exc.UnknownEntityError(path_str)
-            obj = self.paths[path_str]
-            if not isinstance(obj, expected):
-                raise exc.UnknownEntityError(f'{path_str} needs to be a {expected.display_name()}')
-        if expected is None and path_str in self.paths:
-            raise exc.DuplicateNameError(f'{path_str} already exists')
-        # check for containing directory
-        parent_path = path.parent
-        if str(parent_path) not in self.paths:
-            raise exc.UnknownEntityError(f'Directory {str(parent_path)}')
-        parent = self.paths[str(parent_path)]
-        if not isinstance(parent, expected_parent_type):
-            raise exc.UnknownEntityError(f'{str(parent_path)} needs to be a {expected_parent_type.display_name()}')
-
-    def get(self, path_type: Type[SchemaObject]) -> List[SchemaObject]:
-        return [obj for _, obj in self.paths.items() if isinstance(obj, path_type)]
-
-    def get_children(self, parent: Path, child_type: Optional[Type[SchemaObject]], recursive: bool) -> List[Path]:
-        candidates = [
-            Path(path, empty_is_valid=True)
-            for path, obj in self.paths.items() if child_type is None or isinstance(obj, child_type)
-        ]
-        result = [path for path in candidates if parent.is_ancestor(path, is_parent=(not recursive))]
-        return result
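`check_is_valid()` is the gatekeeper for every operation below: `expected=None` means "the path must be free" (creation), while `expected=SomeType` means "the path must exist and hold that type"; in both cases the parent must be a directory. A sketch assuming `Path`, `PathDict`, and `Dir` from this file are in scope (`Dir` built from a directory id, as elsewhere in this diff):

```python
paths = PathDict()
paths[Path('', empty_is_valid=True)] = Dir(0)      # root dir, as Db.create() sets up

paths.check_is_valid(Path('dir1'), expected=None)  # ok: name is free, parent '' is a Dir
paths[Path('dir1')] = Dir(1)
paths.check_is_valid(Path('dir1'), expected=Dir)   # ok: exists and is a Dir
# paths.check_is_valid(Path('dir1'), expected=None)        # raises DuplicateNameError
# paths.check_is_valid(Path('x.t'), expected=MutableTable) # raises UnknownEntityError
```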
-class Db:
-    def __init__(self, db_id: int, name: str):
-        self.id = db_id
-        self.name = name
-        self.paths = PathDict()
-        self.paths.update(self._load_dirs())
-        self.paths.update(self._load_tables())
-        self.paths.update(self._load_function_md())
-
-    def create_table(
-            self, path_str: str, schema: List[Column], num_retained_versions: int = 10,
-            extract_frames_from: Optional[str] = None, extracted_frame_col: Optional[str] = None,
-            extracted_frame_idx_col: Optional[str] = None, extracted_fps: Optional[int] = None
-    ) -> MutableTable:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=None, expected_parent_type=Dir)
-        dir = self.paths[path.parent]
-
-        # make sure frame extraction params are either fully present or absent
-        frame_extraction_param_count = int(extract_frames_from is not None) + int(extracted_frame_col is not None) \
-            + int(extracted_frame_idx_col is not None) + int(extracted_fps is not None)
-        if frame_extraction_param_count != 0 and frame_extraction_param_count != 4:
-            raise exc.BadFormatError(
-                'Frame extraction requires that all parameters (extract_frames_from, extracted_frame_col, '
-                'extracted_frame_idx_col, extracted_fps) be specified')
-        tbl = MutableTable.create(
-            self.id, dir.id, path.name, schema, num_retained_versions, extract_frames_from, extracted_frame_col,
-            extracted_frame_idx_col, extracted_fps)
-        self.paths[path] = tbl
-        return tbl
-
-    def get_table(self, path_str: str) -> Table:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=Table)
-        obj = self.paths[path]
-        assert isinstance(obj, Table)
-        return obj
-
-    def rename_table(self, path_str: str, new_name: str) -> None:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=MutableTable)
-        if re.fullmatch(_ID_RE, new_name) is None:
-            raise exc.BadFormatError(f"Invalid table name: '{new_name}'")
-        new_path = path.parent.append(new_name)
-        self.paths.check_is_valid(new_path, expected=None, expected_parent_type=Dir)
-
-        tbl = self.paths[path]
-        assert isinstance(tbl, MutableTable)
-        del self.paths[path]
-        self.paths[new_path] = tbl
-        tbl.rename(new_name)
-
-    def move_table(self, tbl_path: str, dir_path: str) -> None:
-        pass
-
-    def list_tables(self, dir_path: str = '', recursive: bool = True) -> List[str]:
-        assert dir_path is not None
-        path = Path(dir_path, empty_is_valid=True)
-        self.paths.check_is_valid(path, expected=DirBase)
-        return [str(p) for p in self.paths.get_children(path, child_type=Table, recursive=recursive)]
-
-    def drop_table(self, path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
-        path = Path(path_str)
-        try:
-            self.paths.check_is_valid(path, expected=MutableTable)
-        except Exception as e:
-            if ignore_errors:
-                return
-            else:
-                raise e
-        tbl = self.paths[path]
-        assert isinstance(tbl, MutableTable)
-        tbl.drop()
-        del self.paths[path]
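Taken together, the table methods above made up the 0.1.x client surface that 0.2.0 replaces with `pixeltable.catalog`. A hypothetical usage sketch; the `Column` definitions are elided since that constructor isn't part of this hunk:

```python
db = Db.load('mydb')
t = db.create_table('t1', schema=[...])      # schema: List[Column]; parent '' is the root Dir
db.rename_table('t1', 'clips')
print(db.list_tables())                      # ['clips']
db.drop_table('clips', ignore_errors=True)   # swallows lookup errors instead of raising
```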
-    def create_snapshot(self, path_str: str, tbl_paths: List[str]) -> None:
-        snapshot_dir_path = Path(path_str)
-        self.paths.check_is_valid(snapshot_dir_path, expected=None, expected_parent_type=Dir)
-        tbls: List[MutableTable] = []
-        for tbl_path_str in tbl_paths:
-            tbl_path = Path(tbl_path_str)
-            self.paths.check_is_valid(tbl_path, expected=MutableTable)
-            tbl = self.paths[tbl_path]
-            assert isinstance(tbl, MutableTable)
-            tbls.append(tbl)
-
-        with orm.Session(Env.get().engine) as session:
-            dir_record = store.Dir(db_id=self.id, path=path_str, is_snapshot=True)
-            session.add(dir_record)
-            session.flush()
-            assert dir_record.id is not None
-            self.paths[snapshot_dir_path] = Dir(dir_record.id)
-
-            for tbl in tbls:
-                snapshot_record = store.TableSnapshot(
-                    db_id=self.id, dir_id=dir_record.id, name=tbl.name, tbl_id=tbl.id, tbl_version=tbl.version,
-                    tbl_schema_version=tbl.schema_version)
-                session.add(snapshot_record)
-                session.flush()
-                assert snapshot_record.id is not None
-                cols = Table.load_cols(tbl.id, tbl.schema_version, session)
-                snapshot = TableSnapshot(snapshot_record, cols)
-                snapshot_path = snapshot_dir_path.append(tbl.name)
-                self.paths[snapshot_path] = snapshot
-
-            session.commit()
-
-    def create_dir(self, path_str: str) -> None:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=None, expected_parent_type=Dir)
-        with orm.Session(Env.get().engine) as session:
-            dir_record = store.Dir(db_id=self.id, path=path_str, is_snapshot=False)
-            session.add(dir_record)
-            session.flush()
-            assert dir_record.id is not None
-            self.paths[path] = Dir(dir_record.id)
-            session.commit()
-
-    def rm_dir(self, path_str: str) -> None:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=Dir)
-
-        # make sure it's empty
-        if len(self.paths.get_children(path, child_type=None, recursive=True)) > 0:
-            raise exc.DirectoryNotEmptyError(f'Directory {path_str}')
-        # TODO: figure out how to make force=True work in the presence of snapshots
-        # # delete tables
-        # for tbl_path in self.paths.get_children(path, child_type=Table, recursive=True):
-        #     self.drop_table(str(tbl_path), force=True)
-        # # rm subdirs
-        # for dir_path in self.paths.get_children(path, child_type=DirBase, recursive=False):
-        #     self.rm_dir(str(dir_path), force=True)
-
-        with Env.get().engine.begin() as conn:
-            dir = self.paths[path]
-            conn.execute(sql.delete(store.Dir.__table__).where(store.Dir.id == dir.id))
-            del self.paths[path]
-
-    def list_dirs(self, path_str: str = '', recursive: bool = True) -> List[str]:
-        path = Path(path_str, empty_is_valid=True)
-        self.paths.check_is_valid(path, expected=DirBase)
-        return [str(p) for p in self.paths.get_children(path, child_type=DirBase, recursive=recursive)]
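Directories and snapshots follow the same pattern: validate the path, insert a `store.Dir` row, register the object in `self.paths`. A sketch under the same assumptions as the table example above:

```python
db.create_dir('experiments')
db.create_table('experiments.frames', schema=[...])

# freeze the current version of selected tables under a new snapshot directory
db.create_snapshot('experiments_v1', ['experiments.frames'])
print(db.list_dirs())        # ['experiments', 'experiments_v1']

# rm_dir refuses to delete non-empty directories (no force= yet, per the TODO)
db.rm_dir('experiments')     # raises DirectoryNotEmptyError
```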
-    def create_function(self, path_str: str, func: Function) -> None:
-        if func.is_library_function:
-            raise exc.Error('Cannot create a named function for a library function')
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=None, expected_parent_type=Dir)
-        dir = self.paths[path.parent]
-
-        FunctionRegistry.get().create_function(func, self.id, dir.id, path.name)
-        self.paths[path] = NamedFunction(func.id, dir.id, path.name)
-
-    def rename_function(self, path_str: str, new_path_str: str) -> None:
-        """
-        Assign a new name and/or move the function to a different directory.
-        """
-        path = Path(path_str)
-        new_path = Path(new_path_str)
-        self.paths.check_is_valid(path, expected=NamedFunction)
-        self.paths.check_is_valid(new_path, expected=None)
-        func = self.paths[path]
-        new_dir = self.paths[new_path.parent]
-        with Env.get().engine.begin() as conn:
-            conn.execute(
-                sql.update(store.Function.__table__)
-                .values({
-                    store.Function.dir_id: new_dir.id,
-                    store.Function.name: new_path.name,
-                })
-                .where(store.Function.id == func.id))
-        del self.paths[path]
-        self.paths[new_path] = func
-
-    def update_function(self, path_str: str, new_eval_fn: Callable) -> None:
-        """
-        Update the Function for the given path with the new callable.
-        """
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=NamedFunction)
-        named_fn = self.paths[path]
-        # TODO: check that the function signature doesn't change if the Function is used in a computed column
-        FunctionRegistry.get().update_function(named_fn.id, new_eval_fn)
-
-    def load_function(self, path_str: str) -> Function:
-        path = Path(path_str)
-        self.paths.check_is_valid(path, expected=NamedFunction)
-        named_fn = self.paths[path]
-        assert isinstance(named_fn, NamedFunction)
-        return FunctionRegistry.get().get_function(named_fn.id)
-
-    def drop_function(self, path_str: str, ignore_errors: bool = False) -> None:
-        """
-        Deletes the function from the db, provided that no computed columns depend on it.
-        """
-        path = Path(path_str)
-        try:
-            self.paths.check_is_valid(path, expected=NamedFunction)
-        except exc.UnknownEntityError as e:
-            if ignore_errors:
-                return
-            else:
-                raise e
-        named_fn = self.paths[path]
-        FunctionRegistry.get().delete_function(named_fn.id)
-        del self.paths[path]
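The named-function lifecycle, sketched under the assumptions that `my_fn` is a non-library `Function` instance (library functions are rejected above) and that `dir1` already exists:

```python
db.create_function('dir1.my_fn', my_fn)
f = db.load_function('dir1.my_fn')                  # pulls the callable from FunctionRegistry
db.update_function('dir1.my_fn', new_eval_fn=lambda x: x + 1)
db.rename_function('dir1.my_fn', 'dir1.my_fn_v2')   # rename and/or move in one call
db.drop_function('dir1.my_fn_v2')
```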
-    def _load_dirs(self) -> Dict[str, SchemaObject]:
-        result: Dict[str, SchemaObject] = {}
-        with orm.Session(Env.get().engine) as session:
-            for dir_record in session.query(store.Dir).where(store.Dir.db_id == self.id).all():
-                result[dir_record.path] = SnapshotDir(dir_record.id) if dir_record.is_snapshot else Dir(dir_record.id)
-        return result
-
-    def _load_tables(self) -> Dict[str, SchemaObject]:
-        result: Dict[str, SchemaObject] = {}
-        with orm.Session(Env.get().engine) as session:
-            # load all reachable (= mutable) tables
-            q = session.query(store.Table, store.Dir.path) \
-                .join(store.Dir) \
-                .where(store.Table.db_id == self.id) \
-                .where(store.Table.is_mutable == True)
-            for tbl_record, dir_path in q.all():
-                cols = Table.load_cols(
-                    tbl_record.id, tbl_record.current_schema_version, session)
-                tbl = MutableTable(tbl_record, tbl_record.current_schema_version, cols)
-                tbl._load_valid_rowids()  # TODO: move this someplace more appropriate
-                path = Path(dir_path, empty_is_valid=True).append(tbl_record.name)
-                result[str(path)] = tbl
-
-            # load all table snapshots
-            q = session.query(store.TableSnapshot, store.Dir.path) \
-                .select_from(store.TableSnapshot) \
-                .join(store.Table) \
-                .join(store.Dir) \
-                .where(store.TableSnapshot.db_id == self.id)
-            for snapshot_record, dir_path in q.all():
-                cols = Table.load_cols(snapshot_record.tbl_id, snapshot_record.tbl_schema_version, session)
-                snapshot = TableSnapshot(snapshot_record, cols)
-                path = Path(dir_path, empty_is_valid=True).append(snapshot_record.name)
-                result[str(path)] = snapshot
-
-        return result
-
-    def _load_function_md(self) -> Dict[str, SchemaObject]:
-        """
-        Loads Function metadata. Doesn't load the actual callable, which can be large and is only loaded on demand
-        by the FunctionRegistry.
-        """
-        result: Dict[str, SchemaObject] = {}
-        with orm.Session(Env.get().engine) as session:
-            # load the metadata of all named functions in this db
-            q = session.query(store.Function.id, store.Function.dir_id, store.Function.name, store.Dir.path) \
-                .join(store.Dir) \
-                .where(store.Function.db_id == self.id)
-            for id, dir_id, name, dir_path in q.all():
-                named_fn = NamedFunction(id, dir_id, name)
-                path = Path(dir_path, empty_is_valid=True).append(name)
-                result[str(path)] = named_fn
-        return result
-
-    def __str__(self) -> str:
-        return self.name
-
-    def __repr__(self) -> str:
-        return f'Db(name={self.name})'
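The `empty_is_valid=True` in the `_load_*` methods is needed because the root directory is stored with path `''`; `append()` special-cases the root so reconstructed paths don't grow a leading dot. Assuming `Path` from above:

```python
assert str(Path('', empty_is_valid=True).append('t1')) == 't1'   # child of the root dir
assert str(Path('dir1').append('t1')) == 'dir1.t1'
```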
-    @classmethod
-    def create(cls, name: str) -> 'Db':
-        db_id: int = -1
-        with orm.Session(Env.get().engine) as session:
-            # check for duplicate name
-            is_duplicate = session.query(sql.func.count(store.Db.id)).where(store.Db.name == name).scalar() > 0
-            if is_duplicate:
-                raise exc.DuplicateNameError(f"Db '{name}' already exists")
-
-            db_record = store.Db(name=name)
-            session.add(db_record)
-            session.flush()
-            assert db_record.id is not None
-            db_id = db_record.id
-            # also create a top-level directory, so that every schema object has a directory
-            dir_record = store.Dir(db_id=db_id, path='', is_snapshot=False)
-            session.add(dir_record)
-            session.flush()
-            session.commit()
-        assert db_id is not None
-        return Db(db_id, name)
-
-    @classmethod
-    def load(cls, name: str) -> 'Db':
-        if re.fullmatch(_ID_RE, name) is None:
-            raise exc.BadFormatError(f"Invalid db name: '{name}'")
-        with orm.Session(Env.get().engine) as session:
-            try:
-                db_record = session.query(store.Db).where(store.Db.name == name).one()
-                return Db(db_record.id, db_record.name)
-            except sql.exc.NoResultFound:
-                raise exc.UnknownEntityError(f'Db {name}')
-
-    def delete(self) -> None:
-        """
-        Delete the db and all associated data.
-        """
-        with Env.get().engine.begin() as conn:
-            conn.execute(sql.delete(store.TableSnapshot.__table__).where(store.TableSnapshot.db_id == self.id))
-            tbls_stmt = sql.select(store.Table.id).where(store.Table.db_id == self.id)
-            conn.execute(sql.delete(store.SchemaColumn.__table__).where(store.SchemaColumn.tbl_id.in_(tbls_stmt)))
-            conn.execute(sql.delete(store.StorageColumn.__table__).where(store.StorageColumn.tbl_id.in_(tbls_stmt)))
-            conn.execute(
-                sql.delete(store.TableSchemaVersion.__table__).where(store.TableSchemaVersion.tbl_id.in_(tbls_stmt)))
-            conn.execute(sql.delete(store.Table.__table__).where(store.Table.db_id == self.id))
-            conn.execute(sql.delete(store.Function.__table__).where(store.Function.db_id == self.id))
-            conn.execute(sql.delete(store.Dir.__table__).where(store.Dir.db_id == self.id))
-            conn.execute(sql.delete(store.Db.__table__).where(store.Db.id == self.id))
-            # delete all data tables
-            # TODO: also delete generated images
-            for tbl in self.paths.get(MutableTable):
-                tbl.sa_md.drop_all(bind=conn)