pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +296 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +261 -189
- pixeltable/catalog/table_version.py +333 -202
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -14
- pixeltable/catalog/view.py +38 -6
- pixeltable/dataframe.py +196 -18
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +171 -22
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +7 -0
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/func/tools.py +1 -1
- pixeltable/functions/gemini.py +0 -1
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +12 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +8 -1
- pixeltable/plan.py +221 -14
- pixeltable/share/packager.py +137 -13
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- pixeltable/utils/sample.py +25 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/table_version_handle.py
CHANGED
|
@@ -41,8 +41,21 @@ class TableVersionHandle:
|
|
|
41
41
|
def get(self) -> TableVersion:
|
|
42
42
|
from .catalog import Catalog
|
|
43
43
|
|
|
44
|
-
|
|
45
|
-
|
|
44
|
+
cat = Catalog.get()
|
|
45
|
+
if self._tbl_version is None or not self._tbl_version.is_validated:
|
|
46
|
+
if self.effective_version is not None and self._tbl_version is not None:
|
|
47
|
+
# this is a snapshot version; we need to make sure we refer to the instance cached
|
|
48
|
+
# in Catalog, in order to avoid mixing sa_tbl instances in the same transaction
|
|
49
|
+
# (which will lead to duplicates in the From clause generated in SqlNode.create_from_clause())
|
|
50
|
+
assert (self.id, self.effective_version) in cat._tbl_versions
|
|
51
|
+
self._tbl_version = cat._tbl_versions[self.id, self.effective_version]
|
|
52
|
+
self._tbl_version.is_validated = True
|
|
53
|
+
else:
|
|
54
|
+
self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
|
|
55
|
+
if self.effective_version is None:
|
|
56
|
+
# make sure we don't see a discarded instance of a live TableVersion
|
|
57
|
+
tvs = list(Catalog.get()._tbl_versions.values())
|
|
58
|
+
assert self._tbl_version in tvs
|
|
46
59
|
return self._tbl_version
|
|
47
60
|
|
|
48
61
|
def as_dict(self) -> dict:
|
pixeltable/catalog/table_version_path.py
CHANGED
|
@@ -4,9 +4,11 @@ import logging
|
|
|
4
4
|
from typing import Optional
|
|
5
5
|
from uuid import UUID
|
|
6
6
|
|
|
7
|
+
from pixeltable.env import Env
|
|
7
8
|
from pixeltable.metadata import schema
|
|
8
9
|
|
|
9
10
|
from .column import Column
|
|
11
|
+
from .table_version import TableVersion
|
|
10
12
|
from .table_version_handle import TableVersionHandle
|
|
11
13
|
|
|
12
14
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -22,15 +24,28 @@ class TableVersionPath:
|
|
|
22
24
|
|
|
23
25
|
TableVersionPath contains all metadata needed to execute queries and updates against a particular version of a
|
|
24
26
|
table/view.
|
|
27
|
+
|
|
28
|
+
TableVersionPath supplies metadata needed for query construction (eg, column names), for which it uses a
|
|
29
|
+
cached TableVersion instance.
|
|
30
|
+
- when running inside a transaction, this instance is guaranteed to be validated
|
|
31
|
+
- when running outside a transaction, we use an unvalidated instance in order to avoid repeated validation
|
|
32
|
+
on every metadata-related method call (the instance won't stay validated, because TableVersionHandle.get()
|
|
33
|
+
runs a local transaction, at the end of which the instance is again invalidated)
|
|
34
|
+
- supplying metadata from an unvalidated instance is okay, because it needs to get revalidated anyway when a
|
|
35
|
+
query actually runs (at which point there is a transaction context) - there is no guarantee that in between
|
|
36
|
+
constructing a DataFrame and executing it, the underlying table schema hasn't changed (eg, a concurrent process
|
|
37
|
+
could have dropped a column referenced in the query).
|
|
25
38
|
"""
|
|
26
39
|
|
|
27
40
|
tbl_version: TableVersionHandle
|
|
28
41
|
base: Optional[TableVersionPath]
|
|
42
|
+
_cached_tbl_version: Optional[TableVersion]
|
|
29
43
|
|
|
30
44
|
def __init__(self, tbl_version: TableVersionHandle, base: Optional[TableVersionPath] = None):
|
|
31
45
|
assert tbl_version is not None
|
|
32
46
|
self.tbl_version = tbl_version
|
|
33
47
|
self.base = base
|
|
48
|
+
self._cached_tbl_version = None
|
|
34
49
|
|
|
35
50
|
@classmethod
|
|
36
51
|
def from_md(cls, path: schema.TableVersionPath) -> TableVersionPath:
|
|
@@ -47,17 +62,40 @@ class TableVersionPath:
|
|
|
47
62
|
result.extend(self.base.as_md())
|
|
48
63
|
return result
|
|
49
64
|
|
|
65
|
+
def refresh_cached_md(self) -> None:
|
|
66
|
+
from pixeltable.catalog import Catalog
|
|
67
|
+
|
|
68
|
+
if Env.get().in_xact:
|
|
69
|
+
# when we're running inside a transaction, we need to make sure to supply current metadata;
|
|
70
|
+
# mixing stale metadata with current metadata leads to query construction failures
|
|
71
|
+
# (multiple sqlalchemy Table instances for the same underlying table create corrupted From clauses)
|
|
72
|
+
if self._cached_tbl_version is not None and self._cached_tbl_version.is_validated:
|
|
73
|
+
# nothing to refresh
|
|
74
|
+
return
|
|
75
|
+
elif self._cached_tbl_version is not None:
|
|
76
|
+
return
|
|
77
|
+
|
|
78
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
79
|
+
self._cached_tbl_version = self.tbl_version.get()
|
|
80
|
+
|
|
81
|
+
def clear_cached_md(self) -> None:
|
|
82
|
+
self._cached_tbl_version = None
|
|
83
|
+
if self.base is not None:
|
|
84
|
+
self.base.clear_cached_md()
|
|
85
|
+
|
|
50
86
|
def tbl_id(self) -> UUID:
|
|
51
87
|
"""Return the id of the table/view that this path represents"""
|
|
52
88
|
return self.tbl_version.id
|
|
53
89
|
|
|
54
90
|
def version(self) -> int:
|
|
55
91
|
"""Return the version of the table/view that this path represents"""
|
|
56
|
-
|
|
92
|
+
self.refresh_cached_md()
|
|
93
|
+
return self._cached_tbl_version.version
|
|
57
94
|
|
|
58
95
|
def tbl_name(self) -> str:
|
|
59
96
|
"""Return the name of the table/view that this path represents"""
|
|
60
|
-
|
|
97
|
+
self.refresh_cached_md()
|
|
98
|
+
return self._cached_tbl_version.name
|
|
61
99
|
|
|
62
100
|
def path_len(self) -> int:
|
|
63
101
|
"""Return the length of the path"""
|
|
@@ -65,18 +103,22 @@ class TableVersionPath:
|
|
|
65
103
|
|
|
66
104
|
def is_snapshot(self) -> bool:
|
|
67
105
|
"""Return True if this is a path of snapshot versions"""
|
|
68
|
-
|
|
106
|
+
self.refresh_cached_md()
|
|
107
|
+
if not self._cached_tbl_version.is_snapshot:
|
|
69
108
|
return False
|
|
70
109
|
return self.base.is_snapshot() if self.base is not None else True
|
|
71
110
|
|
|
72
111
|
def is_view(self) -> bool:
|
|
73
|
-
|
|
112
|
+
self.refresh_cached_md()
|
|
113
|
+
return self._cached_tbl_version.is_view
|
|
74
114
|
|
|
75
115
|
def is_component_view(self) -> bool:
|
|
76
|
-
|
|
116
|
+
self.refresh_cached_md()
|
|
117
|
+
return self._cached_tbl_version.is_component_view
|
|
77
118
|
|
|
78
119
|
def is_insertable(self) -> bool:
|
|
79
|
-
|
|
120
|
+
self.refresh_cached_md()
|
|
121
|
+
return self._cached_tbl_version.is_insertable
|
|
80
122
|
|
|
81
123
|
def get_tbl_versions(self) -> list[TableVersionHandle]:
|
|
82
124
|
"""Return all tbl versions"""
|
|
@@ -100,11 +142,12 @@ class TableVersionPath:
|
|
|
100
142
|
|
|
101
143
|
def columns(self) -> list[Column]:
|
|
102
144
|
"""Return all user columns visible in this tbl version path, including columns from bases"""
|
|
103
|
-
|
|
104
|
-
|
|
145
|
+
self.refresh_cached_md()
|
|
146
|
+
result = list(self._cached_tbl_version.cols_by_name.values())
|
|
147
|
+
if self.base is not None and self._cached_tbl_version.include_base_columns:
|
|
105
148
|
base_cols = self.base.columns()
|
|
106
149
|
# we only include base columns that don't conflict with one of our column names
|
|
107
|
-
result.extend(c for c in base_cols if c.name not in self.
|
|
150
|
+
result.extend(c for c in base_cols if c.name not in self._cached_tbl_version.cols_by_name)
|
|
108
151
|
return result
|
|
109
152
|
|
|
110
153
|
def cols_by_name(self) -> dict[str, Column]:
|
|
@@ -119,19 +162,21 @@ class TableVersionPath:
|
|
|
119
162
|
|
|
120
163
|
def get_column(self, name: str, include_bases: Optional[bool] = None) -> Optional[Column]:
|
|
121
164
|
"""Return the column with the given name, or None if not found"""
|
|
122
|
-
|
|
165
|
+
self.refresh_cached_md()
|
|
166
|
+
col = self._cached_tbl_version.cols_by_name.get(name)
|
|
123
167
|
if col is not None:
|
|
124
168
|
return col
|
|
125
|
-
elif self.base is not None and (include_bases or self.
|
|
169
|
+
elif self.base is not None and (include_bases or self._cached_tbl_version.include_base_columns):
|
|
126
170
|
return self.base.get_column(name)
|
|
127
171
|
else:
|
|
128
172
|
return None
|
|
129
173
|
|
|
130
174
|
def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
|
|
131
175
|
"""Return the column for the given tbl/col id"""
|
|
176
|
+
self.refresh_cached_md()
|
|
132
177
|
if self.tbl_version.id == tbl_id:
|
|
133
|
-
assert col_id in self.
|
|
134
|
-
return self.
|
|
178
|
+
assert col_id in self._cached_tbl_version.cols_by_id
|
|
179
|
+
return self._cached_tbl_version.cols_by_id[col_id]
|
|
135
180
|
elif self.base is not None:
|
|
136
181
|
return self.base.get_column_by_id(tbl_id, col_id)
|
|
137
182
|
else:
|
|
@@ -139,11 +184,12 @@ class TableVersionPath:
|
|
|
139
184
|
|
|
140
185
|
def has_column(self, col: Column, include_bases: bool = True) -> bool:
|
|
141
186
|
"""Return True if this table has the given column."""
|
|
187
|
+
self.refresh_cached_md()
|
|
142
188
|
assert col.tbl is not None
|
|
143
189
|
if (
|
|
144
190
|
col.tbl.id == self.tbl_version.id
|
|
145
191
|
and col.tbl.effective_version == self.tbl_version.effective_version
|
|
146
|
-
and col.id in self.
|
|
192
|
+
and col.id in self._cached_tbl_version.cols_by_id
|
|
147
193
|
):
|
|
148
194
|
# the column is visible in this table version
|
|
149
195
|
return True
|
pixeltable/catalog/view.py
CHANGED
|
@@ -12,6 +12,10 @@ from pixeltable import catalog, exprs, func
|
|
|
12
12
|
from pixeltable.env import Env
|
|
13
13
|
from pixeltable.iterators import ComponentIterator
|
|
14
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from pixeltable.plan import SampleClause
|
|
17
|
+
|
|
18
|
+
|
|
15
19
|
from .column import Column
|
|
16
20
|
from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
|
|
17
21
|
from .table import Table
|
|
@@ -66,6 +70,7 @@ class View(Table):
|
|
|
66
70
|
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
67
71
|
additional_columns: dict[str, Any],
|
|
68
72
|
predicate: Optional['exprs.Expr'],
|
|
73
|
+
sample_clause: Optional['SampleClause'],
|
|
69
74
|
is_snapshot: bool,
|
|
70
75
|
num_retained_versions: int,
|
|
71
76
|
comment: str,
|
|
@@ -73,6 +78,8 @@ class View(Table):
|
|
|
73
78
|
iterator_cls: Optional[type[ComponentIterator]],
|
|
74
79
|
iterator_args: Optional[dict],
|
|
75
80
|
) -> View:
|
|
81
|
+
from pixeltable.plan import SampleClause
|
|
82
|
+
|
|
76
83
|
# Convert select_list to more additional_columns if present
|
|
77
84
|
include_base_columns: bool = select_list is None
|
|
78
85
|
select_list_columns: List[Column] = []
|
|
@@ -84,12 +91,23 @@ class View(Table):
|
|
|
84
91
|
columns = select_list_columns + columns_from_additional_columns
|
|
85
92
|
cls._verify_schema(columns)
|
|
86
93
|
|
|
87
|
-
# verify that
|
|
94
|
+
# verify that filters can be evaluated in the context of the base
|
|
88
95
|
if predicate is not None:
|
|
89
96
|
if not predicate.is_bound_by([base]):
|
|
90
97
|
raise excs.Error(f'Filter cannot be computed in the context of the base {base.tbl_name()}')
|
|
91
98
|
# create a copy that we can modify and store
|
|
92
99
|
predicate = predicate.copy()
|
|
100
|
+
if sample_clause is not None:
|
|
101
|
+
# make sure that the sample clause can be computed in the context of the base
|
|
102
|
+
if sample_clause.stratify_exprs is not None and not all(
|
|
103
|
+
stratify_expr.is_bound_by([base]) for stratify_expr in sample_clause.stratify_exprs
|
|
104
|
+
):
|
|
105
|
+
raise excs.Error(f'Sample clause cannot be computed in the context of the base {base.tbl_name()}')
|
|
106
|
+
# create a copy that we can modify and store
|
|
107
|
+
sc = sample_clause
|
|
108
|
+
sample_clause = SampleClause(
|
|
109
|
+
sc.version, sc.n, sc.n_per_stratum, sc.fraction, sc.seed, sc.stratify_exprs.copy()
|
|
110
|
+
)
|
|
93
111
|
|
|
94
112
|
# same for value exprs
|
|
95
113
|
for col in columns:
|
|
@@ -160,6 +178,8 @@ class View(Table):
|
|
|
160
178
|
# if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
|
|
161
179
|
if is_snapshot:
|
|
162
180
|
predicate = predicate.retarget(base_version_path) if predicate is not None else None
|
|
181
|
+
if sample_clause is not None:
|
|
182
|
+
exprs.Expr.retarget_list(sample_clause.stratify_exprs, base_version_path)
|
|
163
183
|
iterator_args_expr = (
|
|
164
184
|
iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
|
|
165
185
|
)
|
|
@@ -171,6 +191,7 @@ class View(Table):
|
|
|
171
191
|
is_snapshot=is_snapshot,
|
|
172
192
|
include_base_columns=include_base_columns,
|
|
173
193
|
predicate=predicate.as_dict() if predicate is not None else None,
|
|
194
|
+
sample_clause=sample_clause.as_dict() if sample_clause is not None else None,
|
|
174
195
|
base_versions=base_version_path.as_md(),
|
|
175
196
|
iterator_class_fqn=iterator_class_fqn,
|
|
176
197
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
@@ -204,8 +225,17 @@ class View(Table):
|
|
|
204
225
|
|
|
205
226
|
from pixeltable.plan import Planner
|
|
206
227
|
|
|
207
|
-
|
|
208
|
-
|
|
228
|
+
try:
|
|
229
|
+
plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
|
|
230
|
+
num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
|
|
231
|
+
except:
|
|
232
|
+
# we need to remove the orphaned TableVersion instance
|
|
233
|
+
del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
|
|
234
|
+
base_tbl_version = base.tbl_version.get()
|
|
235
|
+
if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
|
|
236
|
+
# also remove tbl_version from the base
|
|
237
|
+
base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
|
|
238
|
+
raise
|
|
209
239
|
Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
|
|
210
240
|
|
|
211
241
|
session.commit()
|
|
@@ -285,16 +315,18 @@ class View(Table):
|
|
|
285
315
|
|
|
286
316
|
def _table_descriptor(self) -> str:
|
|
287
317
|
display_name = 'Snapshot' if self._snapshot_only else 'View'
|
|
288
|
-
result = [f'{display_name} {self._path!r}']
|
|
318
|
+
result = [f'{display_name} {self._path()!r}']
|
|
289
319
|
bases_descrs: list[str] = []
|
|
290
320
|
for base, effective_version in zip(self._base_tables, self._effective_base_versions):
|
|
291
321
|
if effective_version is None:
|
|
292
|
-
bases_descrs.append(f'{base._path!r}')
|
|
322
|
+
bases_descrs.append(f'{base._path()!r}')
|
|
293
323
|
else:
|
|
294
|
-
base_descr = f'{base._path}:{effective_version}'
|
|
324
|
+
base_descr = f'{base._path()}:{effective_version}'
|
|
295
325
|
bases_descrs.append(f'{base_descr!r}')
|
|
296
326
|
result.append(f' (of {", ".join(bases_descrs)})')
|
|
297
327
|
|
|
298
328
|
if self._tbl_version.get().predicate is not None:
|
|
299
329
|
result.append(f'\nWhere: {self._tbl_version.get().predicate!s}')
|
|
330
|
+
if self._tbl_version.get().sample_clause is not None:
|
|
331
|
+
result.append(f'\nSample: {self._tbl_version.get().sample_clause!s}')
|
|
300
332
|
return ''.join(result)
|