pixeltable 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +619 -255
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +9 -9
- pixeltable/catalog/path.py +59 -20
- pixeltable/catalog/schema_object.py +10 -4
- pixeltable/catalog/table.py +51 -53
- pixeltable/catalog/table_version.py +216 -156
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/view.py +63 -65
- pixeltable/config.py +12 -4
- pixeltable/dataframe.py +75 -6
- pixeltable/env.py +46 -17
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +2 -6
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +10 -51
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/in_memory_data_node.py +17 -16
- pixeltable/exec/sql_node.py +6 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/exprs/data_row.py +13 -13
- pixeltable/exprs/row_builder.py +16 -4
- pixeltable/exprs/string_op.py +1 -1
- pixeltable/func/expr_template_function.py +1 -4
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/openai.py +9 -6
- pixeltable/functions/timestamp.py +6 -6
- pixeltable/functions/video.py +2 -6
- pixeltable/globals.py +62 -33
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/schema.py +39 -2
- pixeltable/plan.py +5 -14
- pixeltable/share/packager.py +13 -13
- pixeltable/store.py +31 -7
- pixeltable/type_system.py +2 -1
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/media_store.py +90 -34
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/RECORD +52 -51
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
|
@@ -76,7 +76,7 @@ class TableVersionPath:
|
|
|
76
76
|
elif self._cached_tbl_version is not None:
|
|
77
77
|
return
|
|
78
78
|
|
|
79
|
-
with Catalog.get().begin_xact(for_write=False):
|
|
79
|
+
with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
|
|
80
80
|
self._cached_tbl_version = self.tbl_version.get()
|
|
81
81
|
|
|
82
82
|
def clear_cached_md(self) -> None:
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This file contains all dataclasses related to schema.PendingTableOp:
|
|
2
|
+
# - TableOp: the container for each log entry
|
|
3
|
+
# - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
|
|
4
|
+
# enough information for exec_op() to perform the operation without having to reference data outside of
|
|
5
|
+
# TableVersion
|
|
6
|
+
|
|
7
|
+
import dataclasses
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclasses.dataclass
class CreateStoreTableOp:
    """Parameterless operation record; attached to a TableOp via its `create_store_table_op` field."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
class LoadViewOp:
    """Operation record carrying what is needed to load a view."""

    # dict form of the view's path; needed to create the view load plan
    view_path: dict[str, Any]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclasses.dataclass
class DeleteTableMdOp:
    """
    Parameterless operation record.

    NOTE(review): TableOp in this file has no field for this op; confirm where it is consumed.
    """
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclasses.dataclass
class DeleteTableMediaFilesOp:
    """
    Parameterless operation record.

    NOTE(review): TableOp in this file has no field for this op; confirm where it is consumed.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
class DropStoreTableOp:
    """
    Parameterless operation record.

    NOTE(review): TableOp in this file has no field for this op; confirm where it is consumed.
    """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclasses.dataclass
class TableOp:
    """
    Container for a single log entry of a multi-step table update operation.

    Attributes:
        tbl_id: id of the table the op applies to (a uuid.UUID in string form)
        op_sn: sequence number within the update operation; [0, num_ops)
        num_ops: total number of ops forming the update operation
        needs_xact: if True, op must be run as part of a transaction
        create_store_table_op: payload, set when this entry is a CreateStoreTableOp
        load_view_op: payload, set when this entry is a LoadViewOp
    """

    tbl_id: str
    op_sn: int
    num_ops: int
    needs_xact: bool

    # op payloads; all default to None
    create_store_table_op: Optional[CreateStoreTableOp] = None
    load_view_op: Optional[LoadViewOp] = None
|
pixeltable/catalog/view.py
CHANGED
|
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
|
|
|
9
9
|
import pixeltable.metadata.schema as md_schema
|
|
10
10
|
import pixeltable.type_system as ts
|
|
11
11
|
from pixeltable import catalog, exprs, func
|
|
12
|
-
from pixeltable.env import Env
|
|
13
12
|
from pixeltable.iterators import ComponentIterator
|
|
14
13
|
|
|
15
14
|
if TYPE_CHECKING:
|
|
@@ -19,9 +18,10 @@ if TYPE_CHECKING:
|
|
|
19
18
|
from .column import Column
|
|
20
19
|
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
21
20
|
from .table import Table
|
|
22
|
-
from .table_version import TableVersion
|
|
21
|
+
from .table_version import TableVersion, TableVersionMd
|
|
23
22
|
from .table_version_handle import TableVersionHandle
|
|
24
23
|
from .table_version_path import TableVersionPath
|
|
24
|
+
from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
|
|
25
25
|
from .update_status import UpdateStatus
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
@@ -45,9 +45,18 @@ class View(Table):
|
|
|
45
45
|
if not snapshot_only:
|
|
46
46
|
self._tbl_version = tbl_version_path.tbl_version
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
def _display_name(self) -> str:
|
|
49
|
+
name: str
|
|
50
|
+
if self._tbl_version_path.is_snapshot():
|
|
51
|
+
name = 'snapshot'
|
|
52
|
+
elif self._tbl_version_path.is_view():
|
|
53
|
+
name = 'view'
|
|
54
|
+
else:
|
|
55
|
+
assert self._tbl_version_path.is_replica()
|
|
56
|
+
name = 'table'
|
|
57
|
+
if self._tbl_version_path.is_replica():
|
|
58
|
+
name = f'{name}-replica'
|
|
59
|
+
return name
|
|
51
60
|
|
|
52
61
|
@classmethod
|
|
53
62
|
def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
|
|
@@ -80,7 +89,7 @@ class View(Table):
|
|
|
80
89
|
media_validation: MediaValidation,
|
|
81
90
|
iterator_cls: Optional[type[ComponentIterator]],
|
|
82
91
|
iterator_args: Optional[dict],
|
|
83
|
-
) ->
|
|
92
|
+
) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
|
|
84
93
|
from pixeltable.plan import SampleClause
|
|
85
94
|
|
|
86
95
|
# Convert select_list to more additional_columns if present
|
|
@@ -167,11 +176,10 @@ class View(Table):
|
|
|
167
176
|
for col in columns:
|
|
168
177
|
if col.name in iterator_col_names:
|
|
169
178
|
raise excs.Error(
|
|
170
|
-
f'Duplicate name: column {col.name} is already present in the iterator output schema'
|
|
179
|
+
f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
|
|
171
180
|
)
|
|
172
181
|
columns = iterator_cols + columns
|
|
173
182
|
|
|
174
|
-
session = Env.get().session
|
|
175
183
|
from pixeltable.exprs import InlineDict
|
|
176
184
|
|
|
177
185
|
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
@@ -200,54 +208,26 @@ class View(Table):
|
|
|
200
208
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
201
209
|
)
|
|
202
210
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
name,
|
|
206
|
-
columns,
|
|
207
|
-
num_retained_versions,
|
|
208
|
-
comment,
|
|
209
|
-
media_validation=media_validation,
|
|
210
|
-
# base_path=base_version_path,
|
|
211
|
-
view_md=view_md,
|
|
211
|
+
md = TableVersion.create_initial_md(
|
|
212
|
+
name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
|
|
212
213
|
)
|
|
213
|
-
if
|
|
214
|
-
# this is purely a snapshot:
|
|
215
|
-
|
|
216
|
-
_logger.info(f'created snapshot {name}')
|
|
214
|
+
if md.tbl_md.is_pure_snapshot:
|
|
215
|
+
# this is purely a snapshot: no store table to create or load
|
|
216
|
+
return md, None
|
|
217
217
|
else:
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
name,
|
|
222
|
-
TableVersionPath(
|
|
223
|
-
TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
|
|
224
|
-
),
|
|
225
|
-
snapshot_only=False,
|
|
226
|
-
)
|
|
227
|
-
_logger.info(f'Created view `{name}`, id={tbl_version.id}')
|
|
228
|
-
|
|
229
|
-
from pixeltable.plan import Planner
|
|
230
|
-
|
|
231
|
-
try:
|
|
232
|
-
plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
|
|
233
|
-
_, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
|
|
234
|
-
status = UpdateStatus(row_count_stats=row_counts)
|
|
235
|
-
tbl_version._write_md_update_status(0, update_status=status)
|
|
236
|
-
|
|
237
|
-
except:
|
|
238
|
-
# we need to remove the orphaned TableVersion instance
|
|
239
|
-
del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
|
|
240
|
-
base_tbl_version = base.tbl_version.get()
|
|
241
|
-
if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
|
|
242
|
-
# also remove tbl_version from the base
|
|
243
|
-
base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
|
|
244
|
-
raise
|
|
245
|
-
Env.get().console_logger.info(
|
|
246
|
-
f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
|
|
218
|
+
tbl_id = md.tbl_md.tbl_id
|
|
219
|
+
view_path = TableVersionPath(
|
|
220
|
+
TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
|
|
247
221
|
)
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
222
|
+
ops = [
|
|
223
|
+
TableOp(
|
|
224
|
+
tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
|
|
225
|
+
),
|
|
226
|
+
TableOp(
|
|
227
|
+
tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
|
|
228
|
+
),
|
|
229
|
+
]
|
|
230
|
+
return md, ops
|
|
251
231
|
|
|
252
232
|
@classmethod
|
|
253
233
|
def _verify_column(cls, col: Column) -> None:
|
|
@@ -275,13 +255,26 @@ class View(Table):
|
|
|
275
255
|
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
|
|
276
256
|
)
|
|
277
257
|
|
|
258
|
+
def _is_anonymous_snapshot(self) -> bool:
|
|
259
|
+
"""
|
|
260
|
+
Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
|
|
261
|
+
"""
|
|
262
|
+
return self._snapshot_only and self._id == self._tbl_version_path.tbl_id
|
|
263
|
+
|
|
278
264
|
def _get_metadata(self) -> dict[str, Any]:
    """
    Return the base-class metadata extended with view-specific entries.

    Adds 'is_view', 'is_snapshot' and 'base'. For an anonymous snapshot, 'name' and 'path' get a
    ':<version>' qualifier. 'base' is None if there is no base table; otherwise it is the base table's
    path, qualified with ':<version>' when the base is pinned to a specific version.
    """
    md = super()._get_metadata()
    md['is_view'] = True
    md['is_snapshot'] = self._tbl_version_path.is_snapshot()

    if self._is_anonymous_snapshot():
        # Update name and path with version qualifiers.
        md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
        md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'

    base_tbl = self._get_base_table()
    base_descr = None
    if base_tbl is not None:
        base_version = self._effective_base_versions[0]
        base_descr = base_tbl._path()
        if base_version is not None:
            base_descr = f'{base_descr}:{base_version}'
    md['base'] = base_descr
    return md
|
|
286
279
|
|
|
287
280
|
def insert(
|
|
@@ -295,28 +288,33 @@ class View(Table):
|
|
|
295
288
|
print_stats: bool = False,
|
|
296
289
|
**kwargs: Any,
|
|
297
290
|
) -> UpdateStatus:
|
|
298
|
-
raise excs.Error(f'{self.
|
|
291
|
+
raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
|
|
299
292
|
|
|
300
293
|
def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
    """
    Always raises: this override rejects deletion for this kind of object.

    Raises:
        Error: unconditionally; the message names this object and its kind.
    """
    raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
|
|
302
295
|
|
|
303
296
|
def _get_base_table(self) -> Optional['Table']:
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
297
|
+
if self._tbl_version_path.tbl_id != self._id:
|
|
298
|
+
# _tbl_version_path represents a different schema object from this one. This can only happen if this is a
|
|
299
|
+
# named pure snapshot.
|
|
300
|
+
base_id = self._tbl_version_path.tbl_id
|
|
301
|
+
elif self._tbl_version_path.base is None:
|
|
302
|
+
return None
|
|
303
|
+
else:
|
|
304
|
+
base_id = self._tbl_version_path.base.tbl_id
|
|
305
|
+
with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
|
|
306
|
+
return catalog.Catalog.get().get_table_by_id(base_id)
|
|
308
307
|
|
|
309
308
|
@property
|
|
310
309
|
def _effective_base_versions(self) -> list[Optional[int]]:
|
|
311
310
|
effective_versions = [tv.effective_version for tv in self._tbl_version_path.get_tbl_versions()]
|
|
312
|
-
if self._snapshot_only:
|
|
313
|
-
return effective_versions
|
|
311
|
+
if self._snapshot_only and not self._is_anonymous_snapshot():
|
|
312
|
+
return effective_versions # Named pure snapshot
|
|
314
313
|
else:
|
|
315
314
|
return effective_versions[1:]
|
|
316
315
|
|
|
317
316
|
def _table_descriptor(self) -> str:
|
|
318
|
-
|
|
319
|
-
result = [f'{display_name} {self._path()!r}']
|
|
317
|
+
result = [self._display_str()]
|
|
320
318
|
bases_descrs: list[str] = []
|
|
321
319
|
for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
|
|
322
320
|
if effective_version is None:
|
pixeltable/config.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Any, ClassVar, Optional, TypeVar
|
|
|
8
8
|
|
|
9
9
|
import toml
|
|
10
10
|
|
|
11
|
-
from pixeltable import exceptions as excs
|
|
11
|
+
from pixeltable import env, exceptions as excs
|
|
12
12
|
|
|
13
13
|
_logger = logging.getLogger('pixeltable')
|
|
14
14
|
|
|
@@ -82,7 +82,11 @@ class Config:
|
|
|
82
82
|
return cls.__instance
|
|
83
83
|
|
|
84
84
|
@classmethod
|
|
85
|
-
def init(cls, config_overrides: dict[str, Any]) -> None:
|
|
85
|
+
def init(cls, config_overrides: dict[str, Any], reinit: bool = False) -> None:
|
|
86
|
+
if reinit:
|
|
87
|
+
cls.__instance = None
|
|
88
|
+
for cl in env._registered_clients.values():
|
|
89
|
+
cl.client_obj = None
|
|
86
90
|
if cls.__instance is None:
|
|
87
91
|
cls.__instance = cls(config_overrides)
|
|
88
92
|
elif len(config_overrides) > 0:
|
|
@@ -102,7 +106,7 @@ class Config:
|
|
|
102
106
|
env_var = f'{section.upper()}_{key.upper()}'
|
|
103
107
|
if override_var in self.__config_overrides:
|
|
104
108
|
return self.__config_overrides[override_var]
|
|
105
|
-
if env_var in os.environ:
|
|
109
|
+
if env_var in os.environ and len(os.environ[env_var]) > 0:
|
|
106
110
|
return os.environ[env_var]
|
|
107
111
|
return default
|
|
108
112
|
|
|
@@ -157,7 +161,11 @@ KNOWN_CONFIG_OPTIONS = {
|
|
|
157
161
|
'groq': {'api_key': 'Groq API key'},
|
|
158
162
|
'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
|
|
159
163
|
'mistral': {'api_key': 'Mistral API key'},
|
|
160
|
-
'openai': {
|
|
164
|
+
'openai': {
|
|
165
|
+
'api_key': 'OpenAI API key',
|
|
166
|
+
'base_url': 'OpenAI API base URL',
|
|
167
|
+
'api_version': 'API version if using Azure OpenAI',
|
|
168
|
+
},
|
|
161
169
|
'replicate': {'api_token': 'Replicate API token'},
|
|
162
170
|
'together': {'api_key': 'Together API key'},
|
|
163
171
|
'pypi': {'api_key': 'PyPI API key (for internal use only)'},
|
pixeltable/dataframe.py
CHANGED
|
@@ -8,9 +8,22 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
11
|
+
from typing import (
|
|
12
|
+
TYPE_CHECKING,
|
|
13
|
+
Any,
|
|
14
|
+
AsyncIterator,
|
|
15
|
+
Callable,
|
|
16
|
+
Hashable,
|
|
17
|
+
Iterator,
|
|
18
|
+
NoReturn,
|
|
19
|
+
Optional,
|
|
20
|
+
Sequence,
|
|
21
|
+
TypeVar,
|
|
22
|
+
Union,
|
|
23
|
+
)
|
|
12
24
|
|
|
13
25
|
import pandas as pd
|
|
26
|
+
import pydantic
|
|
14
27
|
import sqlalchemy as sql
|
|
15
28
|
|
|
16
29
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
@@ -32,6 +45,11 @@ _logger = logging.getLogger('pixeltable')
|
|
|
32
45
|
|
|
33
46
|
|
|
34
47
|
class DataFrameResultSet:
|
|
48
|
+
_rows: list[list[Any]]
|
|
49
|
+
_col_names: list[str]
|
|
50
|
+
__schema: dict[str, ColumnType]
|
|
51
|
+
__formatter: Formatter
|
|
52
|
+
|
|
35
53
|
def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
|
|
36
54
|
self._rows = rows
|
|
37
55
|
self._col_names = list(schema.keys())
|
|
@@ -66,6 +84,44 @@ class DataFrameResultSet:
|
|
|
66
84
|
def to_pandas(self) -> pd.DataFrame:
|
|
67
85
|
return pd.DataFrame.from_records(self._rows, columns=self._col_names)
|
|
68
86
|
|
|
87
|
+
BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
|
|
88
|
+
|
|
89
|
+
def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
    """
    Convert the rows of this DataFrameResultSet to Pydantic model instances.

    The result set's columns are checked against the model when this method is called, so a schema
    mismatch is reported at the call site. The rows themselves are converted lazily, one at a time, as
    the returned iterator is consumed.

    Args:
        model: A Pydantic model class.

    Returns:
        An iterator over Pydantic model instances, one for each row in the result set.

    Raises:
        Error: If a required model field has no matching column, or if the model is configured with
            extra='forbid' and the result set has columns that are not model fields. During iteration,
            if a row fails model validation.
    """
    model_fields = model.model_fields
    model_config = getattr(model, 'model_config', {})
    forbid_extra_fields = model_config.get('extra') == 'forbid'

    # schema validation; this must not live in a generator body, or it would be deferred until the first next()
    required_fields = {name for name, field in model_fields.items() if field.is_required()}
    col_names = set(self._col_names)
    missing_fields = required_fields - col_names
    if len(missing_fields) > 0:
        raise excs.Error(
            f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
        )
    if forbid_extra_fields:
        extra_fields = col_names - set(model_fields.keys())
        if len(extra_fields) > 0:
            raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")

    def convert_rows() -> Iterator[Any]:
        # per-row conversion stays lazy
        for row in self:
            try:
                yield model(**row)
            except pydantic.ValidationError as e:
                raise excs.Error(str(e)) from e

    return convert_rows()
|
|
124
|
+
|
|
69
125
|
def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
|
|
70
126
|
return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
|
|
71
127
|
|
|
@@ -1185,7 +1241,7 @@ class DataFrame:
|
|
|
1185
1241
|
"""
|
|
1186
1242
|
self._validate_mutable('delete', False)
|
|
1187
1243
|
if not self._first_tbl.is_insertable():
|
|
1188
|
-
raise excs.Error('Cannot delete
|
|
1244
|
+
raise excs.Error('Cannot use `delete` on a view.')
|
|
1189
1245
|
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1190
1246
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1191
1247
|
|
|
@@ -1196,14 +1252,27 @@ class DataFrame:
|
|
|
1196
1252
|
op_name: The name of the operation for which the test is being performed.
|
|
1197
1253
|
allow_select: If True, allow a select() specification in the Dataframe.
|
|
1198
1254
|
"""
|
|
1255
|
+
self._validate_mutable_op_sequence(op_name, allow_select)
|
|
1256
|
+
|
|
1257
|
+
# TODO: Reconcile these with Table.__check_mutable()
|
|
1258
|
+
assert len(self._from_clause.tbls) == 1
|
|
1259
|
+
if self._first_tbl.is_snapshot():
|
|
1260
|
+
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1261
|
+
if self._first_tbl.is_replica():
|
|
1262
|
+
raise excs.Error(f'Cannot use `{op_name}` on a replica.')
|
|
1263
|
+
|
|
1264
|
+
def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
|
|
1265
|
+
"""Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
|
|
1199
1266
|
if self.group_by_clause is not None or self.grouping_tbl is not None:
|
|
1200
|
-
raise excs.Error(f'Cannot use `{op_name}` after `group_by
|
|
1267
|
+
raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
|
|
1201
1268
|
if self.order_by_clause is not None:
|
|
1202
|
-
raise excs.Error(f'Cannot use `{op_name}` after `order_by
|
|
1269
|
+
raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
|
|
1203
1270
|
if self.select_list is not None and not allow_select:
|
|
1204
|
-
raise excs.Error(f'Cannot use `{op_name}` after `select
|
|
1271
|
+
raise excs.Error(f'Cannot use `{op_name}` after `select`.')
|
|
1205
1272
|
if self.limit_val is not None:
|
|
1206
|
-
raise excs.Error(f'Cannot use `{op_name}` after `limit
|
|
1273
|
+
raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
|
|
1274
|
+
if self._has_joins():
|
|
1275
|
+
raise excs.Error(f'Cannot use `{op_name}` after `join`.')
|
|
1207
1276
|
|
|
1208
1277
|
def as_dict(self) -> dict[str, Any]:
|
|
1209
1278
|
"""
|
pixeltable/env.py
CHANGED
|
@@ -13,6 +13,8 @@ import platform
|
|
|
13
13
|
import shutil
|
|
14
14
|
import sys
|
|
15
15
|
import threading
|
|
16
|
+
import types
|
|
17
|
+
import typing
|
|
16
18
|
import uuid
|
|
17
19
|
import warnings
|
|
18
20
|
from abc import abstractmethod
|
|
@@ -20,7 +22,7 @@ from contextlib import contextmanager
|
|
|
20
22
|
from dataclasses import dataclass, field
|
|
21
23
|
from pathlib import Path
|
|
22
24
|
from sys import stdout
|
|
23
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
|
|
25
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
|
|
24
26
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
25
27
|
|
|
26
28
|
import nest_asyncio # type: ignore[import-untyped]
|
|
@@ -86,6 +88,7 @@ class Env:
|
|
|
86
88
|
_resource_pool_info: dict[str, Any]
|
|
87
89
|
_current_conn: Optional[sql.Connection]
|
|
88
90
|
_current_session: Optional[sql.orm.Session]
|
|
91
|
+
_current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
|
|
89
92
|
_dbms: Optional[Dbms]
|
|
90
93
|
_event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
|
|
91
94
|
|
|
@@ -99,6 +102,7 @@ class Env:
|
|
|
99
102
|
def _init_env(cls, reinit_db: bool = False) -> None:
|
|
100
103
|
assert not cls.__initializing, 'Circular env initialization detected.'
|
|
101
104
|
cls.__initializing = True
|
|
105
|
+
cls._instance = None
|
|
102
106
|
env = Env()
|
|
103
107
|
env._set_up(reinit_db=reinit_db)
|
|
104
108
|
env._upgrade_metadata()
|
|
@@ -142,6 +146,7 @@ class Env:
|
|
|
142
146
|
self._resource_pool_info = {}
|
|
143
147
|
self._current_conn = None
|
|
144
148
|
self._current_session = None
|
|
149
|
+
self._current_isolation_level = None
|
|
145
150
|
self._dbms = None
|
|
146
151
|
self._event_loop = None
|
|
147
152
|
|
|
@@ -230,20 +235,34 @@ class Env:
|
|
|
230
235
|
return self._db_server is not None
|
|
231
236
|
|
|
232
237
|
@contextmanager
def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
    """
    Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.

    If no transaction is active, opens a connection, session and transaction, records them (and the
    isolation level) on this Env for the duration of the context, and clears them again on exit.
    If a transaction is already active, yields its connection without opening a new one.

    for_write: if True, uses serializable isolation; if False, uses repeatable_read

    TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
    that avoids tripping over any pending ops
    """
    if self._current_conn is None:
        assert self._current_session is None
        try:
            self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
            with (
                self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                sql.orm.Session(conn) as session,
                conn.begin(),
            ):
                self._current_conn = conn
                self._current_session = session
                yield conn
        finally:
            self._current_session = None
            self._current_conn = None
            self._current_isolation_level = None
    else:
        assert self._current_session is not None
        # A nested write request is only valid inside a serializable transaction. The stored level is
        # upper-case ('SERIALIZABLE'), so the comparison must be as well; nested reads may join any
        # active transaction.
        assert not for_write or self._current_isolation_level == 'SERIALIZABLE'
        yield self._current_conn
|
|
248
267
|
|
|
249
268
|
def configure_logging(
|
|
@@ -587,16 +606,26 @@ class Env:
|
|
|
587
606
|
|
|
588
607
|
# Construct a client, retrieving each parameter from config.
|
|
589
608
|
|
|
590
|
-
init_kwargs: dict[str,
|
|
591
|
-
for param in cl.
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
609
|
+
init_kwargs: dict[str, Any] = {}
|
|
610
|
+
for param in cl.params.values():
|
|
611
|
+
# Determine the type of the parameter for proper config parsing.
|
|
612
|
+
t = param.annotation
|
|
613
|
+
# Dereference Optional[T]
|
|
614
|
+
if typing.get_origin(t) in (typing.Union, types.UnionType):
|
|
615
|
+
args = typing.get_args(t)
|
|
616
|
+
if args[0] is type(None):
|
|
617
|
+
t = args[1]
|
|
618
|
+
elif args[1] is type(None):
|
|
619
|
+
t = args[0]
|
|
620
|
+
assert isinstance(t, type), t
|
|
621
|
+
arg: Any = Config.get().get_value(param.name, t, section=name)
|
|
622
|
+
if arg is not None:
|
|
623
|
+
init_kwargs[param.name] = arg
|
|
624
|
+
elif param.default is inspect.Parameter.empty:
|
|
596
625
|
raise excs.Error(
|
|
597
|
-
f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
|
|
598
|
-
f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
|
|
599
|
-
f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
626
|
+
f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
|
|
627
|
+
f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
|
|
628
|
+
f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
|
|
600
629
|
)
|
|
601
630
|
|
|
602
631
|
cl.client_obj = cl.init_fn(**init_kwargs)
|
|
@@ -607,7 +636,7 @@ class Env:
|
|
|
607
636
|
"""
|
|
608
637
|
The http server root is the file system root.
|
|
609
638
|
eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
|
|
610
|
-
|
|
639
|
+
On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
|
|
611
640
|
This arrangement enables serving media hosted within _home,
|
|
612
641
|
as well as external media inserted into pixeltable or produced by pixeltable.
|
|
613
642
|
The port is chosen dynamically to prevent conflicts.
|
|
@@ -815,8 +844,8 @@ def register_client(name: str) -> Callable:
|
|
|
815
844
|
|
|
816
845
|
def decorator(fn: Callable) -> None:
|
|
817
846
|
sig = inspect.signature(fn)
|
|
818
|
-
|
|
819
|
-
_registered_clients[name] = ApiClient(init_fn=fn,
|
|
847
|
+
params = dict(sig.parameters)
|
|
848
|
+
_registered_clients[name] = ApiClient(init_fn=fn, params=params)
|
|
820
849
|
|
|
821
850
|
return decorator
|
|
822
851
|
|
|
@@ -827,7 +856,7 @@ _registered_clients: dict[str, ApiClient] = {}
|
|
|
827
856
|
@dataclass
class ApiClient:
    """
    Registration record for an API client.

    Attributes:
        init_fn: callable that constructs the client; invoked with keyword arguments
        params: parameters of `init_fn`, keyed by parameter name
        client_obj: the constructed client, or None while it has not been created
    """

    init_fn: Callable
    params: dict[str, inspect.Parameter]
    client_obj: Optional[Any] = None
|
|
832
861
|
|
|
833
862
|
|
|
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
|
|
|
45
45
|
# we need to make sure to refer to the same exprs that RowBuilder.eval() will use
|
|
46
46
|
self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
|
|
47
47
|
# create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
|
|
48
|
-
self.output_batch = DataRowBatch(
|
|
48
|
+
self.output_batch = DataRowBatch(row_builder)
|
|
49
49
|
self.limit = None
|
|
50
50
|
|
|
51
51
|
def set_limit(self, limit: int) -> None:
|
|
@@ -12,7 +12,7 @@ from pathlib import Path
|
|
|
12
12
|
from typing import Any, AsyncIterator, Iterator, Optional
|
|
13
13
|
from uuid import UUID
|
|
14
14
|
|
|
15
|
-
from pixeltable import
|
|
15
|
+
from pixeltable import env, exceptions as excs, exprs
|
|
16
16
|
from pixeltable.utils.filecache import FileCache
|
|
17
17
|
|
|
18
18
|
from .data_row_batch import DataRowBatch
|
|
@@ -37,7 +37,6 @@ class CachePrefetchNode(ExecNode):
|
|
|
37
37
|
boto_client_lock: threading.Lock
|
|
38
38
|
|
|
39
39
|
# execution state
|
|
40
|
-
batch_tbl_version: Optional[catalog.TableVersionHandle] # needed to construct output batches
|
|
41
40
|
num_returned_rows: int
|
|
42
41
|
|
|
43
42
|
# ready_rows: rows that are ready to be returned, ordered by row idx;
|
|
@@ -68,7 +67,6 @@ class CachePrefetchNode(ExecNode):
|
|
|
68
67
|
self.boto_client = None
|
|
69
68
|
self.boto_client_lock = threading.Lock()
|
|
70
69
|
|
|
71
|
-
self.batch_tbl_version = None
|
|
72
70
|
self.num_returned_rows = 0
|
|
73
71
|
self.ready_rows = deque()
|
|
74
72
|
self.in_flight_rows = {}
|
|
@@ -95,7 +93,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
95
93
|
|
|
96
94
|
if len(self.ready_rows) > 0:
|
|
97
95
|
# create DataRowBatch from the first BATCH_SIZE ready rows
|
|
98
|
-
batch = DataRowBatch(self.
|
|
96
|
+
batch = DataRowBatch(self.row_builder)
|
|
99
97
|
rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
|
|
100
98
|
for row in rows:
|
|
101
99
|
assert row is not None
|
|
@@ -173,8 +171,6 @@ class CachePrefetchNode(ExecNode):
|
|
|
173
171
|
if input_batch is None:
|
|
174
172
|
self.input_finished = True
|
|
175
173
|
return
|
|
176
|
-
if self.batch_tbl_version is None:
|
|
177
|
-
self.batch_tbl_version = input_batch.tbl
|
|
178
174
|
|
|
179
175
|
file_cache = FileCache.get()
|
|
180
176
|
|
|
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
|
|
43
|
-
output_batch = DataRowBatch(self.
|
|
43
|
+
output_batch = DataRowBatch(self.row_builder)
|
|
44
44
|
async for input_batch in self.input:
|
|
45
45
|
for input_row in input_batch:
|
|
46
46
|
self.row_builder.eval(input_row, self.iterator_args_ctx)
|
|
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
|
|
|
52
52
|
if self.__non_nullable_args_specified(iterator_args):
|
|
53
53
|
iterator = self.view.get().iterator_cls(**iterator_args)
|
|
54
54
|
for pos, component_dict in enumerate(iterator):
|
|
55
|
-
output_row =
|
|
55
|
+
output_row = self.row_builder.make_row()
|
|
56
56
|
input_row.copy(output_row)
|
|
57
57
|
# we're expanding the input and need to add the iterator position to the pk
|
|
58
58
|
self.__populate_output_row(output_row, pos, component_dict)
|
|
59
|
+
output_batch.add_row(output_row)
|
|
59
60
|
if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
|
|
60
61
|
yield output_batch
|
|
61
|
-
output_batch = DataRowBatch(self.
|
|
62
|
+
output_batch = DataRowBatch(self.row_builder)
|
|
62
63
|
|
|
63
64
|
if len(output_batch) > 0:
|
|
64
65
|
yield output_batch
|