pixeltable 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +526 -197
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +9 -9
- pixeltable/catalog/schema_object.py +9 -4
- pixeltable/catalog/table.py +45 -53
- pixeltable/catalog/table_version.py +214 -155
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/view.py +47 -60
- pixeltable/dataframe.py +18 -5
- pixeltable/env.py +21 -4
- pixeltable/exec/data_row_batch.py +3 -1
- pixeltable/exec/in_memory_data_node.py +6 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/openai.py +1 -2
- pixeltable/functions/video.py +2 -6
- pixeltable/globals.py +50 -25
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/schema.py +38 -1
- pixeltable/store.py +22 -1
- pixeltable/utils/media_store.py +11 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.4.dist-info}/RECORD +33 -32
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
|
@@ -76,7 +76,7 @@ class TableVersionPath:
|
|
|
76
76
|
elif self._cached_tbl_version is not None:
|
|
77
77
|
return
|
|
78
78
|
|
|
79
|
-
with Catalog.get().begin_xact(for_write=False):
|
|
79
|
+
with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
|
|
80
80
|
self._cached_tbl_version = self.tbl_version.get()
|
|
81
81
|
|
|
82
82
|
def clear_cached_md(self) -> None:
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This file contains all dataclasses related to schema.PendingTableOp:
|
|
2
|
+
# - TableOp: the container for each log entry
|
|
3
|
+
# - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
|
|
4
|
+
# enough information for exec_op() to perform the operation without having to reference data outside of
|
|
5
|
+
# TableVersion
|
|
6
|
+
|
|
7
|
+
import dataclasses
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclasses.dataclass
class CreateStoreTableOp:
    """Field-less op marker carried in TableOp.create_store_table_op.

    NOTE(review): judging by the name, this requests creation of the table's store table; confirm against
    TableVersion.exec_op().
    """
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
class LoadViewOp:
    """Op payload for loading a view; carried in TableOp.load_view_op."""

    # dict form of the view's path; needed to create the view load plan
    view_path: dict[str, Any]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclasses.dataclass
class DeleteTableMdOp:
    """Field-less op marker.

    NOTE(review): judging by the name, this requests deletion of a table's metadata; confirm against
    TableVersion.exec_op().
    """
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclasses.dataclass
class DeleteTableMediaFilesOp:
    """Field-less op marker.

    NOTE(review): judging by the name, this requests deletion of a table's media files; confirm against
    TableVersion.exec_op().
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
class DropStoreTableOp:
    """Field-less op marker.

    NOTE(review): judging by the name, this requests dropping the table's store table; confirm against
    TableVersion.exec_op().
    """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclasses.dataclass
class TableOp:
    """Container for one log entry: which table it applies to, where it sits in its update operation, and the op.

    The op itself is carried in one of the optional `*_op` fields, which default to None.
    """

    # string form of the table's uuid.UUID
    tbl_id: str
    # sequence number within the update operation; [0, num_ops)
    op_sn: int
    # total number of ops forming the update operation
    num_ops: int
    # if True, op must be run as part of a transaction
    needs_xact: bool

    create_store_table_op: Optional[CreateStoreTableOp] = None
    load_view_op: Optional[LoadViewOp] = None
|
pixeltable/catalog/view.py
CHANGED
|
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
|
|
|
9
9
|
import pixeltable.metadata.schema as md_schema
|
|
10
10
|
import pixeltable.type_system as ts
|
|
11
11
|
from pixeltable import catalog, exprs, func
|
|
12
|
-
from pixeltable.env import Env
|
|
13
12
|
from pixeltable.iterators import ComponentIterator
|
|
14
13
|
|
|
15
14
|
if TYPE_CHECKING:
|
|
@@ -19,9 +18,10 @@ if TYPE_CHECKING:
|
|
|
19
18
|
from .column import Column
|
|
20
19
|
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
21
20
|
from .table import Table
|
|
22
|
-
from .table_version import TableVersion
|
|
21
|
+
from .table_version import TableVersion, TableVersionMd
|
|
23
22
|
from .table_version_handle import TableVersionHandle
|
|
24
23
|
from .table_version_path import TableVersionPath
|
|
24
|
+
from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
|
|
25
25
|
from .update_status import UpdateStatus
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
@@ -45,9 +45,18 @@ class View(Table):
|
|
|
45
45
|
if not snapshot_only:
|
|
46
46
|
self._tbl_version = tbl_version_path.tbl_version
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
def _display_name(self) -> str:
|
|
49
|
+
name: str
|
|
50
|
+
if self._tbl_version_path.is_snapshot():
|
|
51
|
+
name = 'snapshot'
|
|
52
|
+
elif self._tbl_version_path.is_view():
|
|
53
|
+
name = 'view'
|
|
54
|
+
else:
|
|
55
|
+
assert self._tbl_version_path.is_replica()
|
|
56
|
+
name = 'table'
|
|
57
|
+
if self._tbl_version_path.is_replica():
|
|
58
|
+
name = f'{name}-replica'
|
|
59
|
+
return name
|
|
51
60
|
|
|
52
61
|
@classmethod
|
|
53
62
|
def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
|
|
@@ -80,7 +89,7 @@ class View(Table):
|
|
|
80
89
|
media_validation: MediaValidation,
|
|
81
90
|
iterator_cls: Optional[type[ComponentIterator]],
|
|
82
91
|
iterator_args: Optional[dict],
|
|
83
|
-
) ->
|
|
92
|
+
) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
|
|
84
93
|
from pixeltable.plan import SampleClause
|
|
85
94
|
|
|
86
95
|
# Convert select_list to more additional_columns if present
|
|
@@ -167,11 +176,10 @@ class View(Table):
|
|
|
167
176
|
for col in columns:
|
|
168
177
|
if col.name in iterator_col_names:
|
|
169
178
|
raise excs.Error(
|
|
170
|
-
f'Duplicate name: column {col.name} is already present in the iterator output schema'
|
|
179
|
+
f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
|
|
171
180
|
)
|
|
172
181
|
columns = iterator_cols + columns
|
|
173
182
|
|
|
174
|
-
session = Env.get().session
|
|
175
183
|
from pixeltable.exprs import InlineDict
|
|
176
184
|
|
|
177
185
|
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
@@ -200,54 +208,26 @@ class View(Table):
|
|
|
200
208
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
201
209
|
)
|
|
202
210
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
name,
|
|
206
|
-
columns,
|
|
207
|
-
num_retained_versions,
|
|
208
|
-
comment,
|
|
209
|
-
media_validation=media_validation,
|
|
210
|
-
# base_path=base_version_path,
|
|
211
|
-
view_md=view_md,
|
|
211
|
+
md = TableVersion.create_initial_md(
|
|
212
|
+
name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
|
|
212
213
|
)
|
|
213
|
-
if
|
|
214
|
-
# this is purely a snapshot:
|
|
215
|
-
|
|
216
|
-
_logger.info(f'created snapshot {name}')
|
|
214
|
+
if md.tbl_md.is_pure_snapshot:
|
|
215
|
+
# this is purely a snapshot: no store table to create or load
|
|
216
|
+
return md, None
|
|
217
217
|
else:
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
name,
|
|
222
|
-
TableVersionPath(
|
|
223
|
-
TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
|
|
224
|
-
),
|
|
225
|
-
snapshot_only=False,
|
|
226
|
-
)
|
|
227
|
-
_logger.info(f'Created view `{name}`, id={tbl_version.id}')
|
|
228
|
-
|
|
229
|
-
from pixeltable.plan import Planner
|
|
230
|
-
|
|
231
|
-
try:
|
|
232
|
-
plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
|
|
233
|
-
_, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
|
|
234
|
-
status = UpdateStatus(row_count_stats=row_counts)
|
|
235
|
-
tbl_version._write_md_update_status(0, update_status=status)
|
|
236
|
-
|
|
237
|
-
except:
|
|
238
|
-
# we need to remove the orphaned TableVersion instance
|
|
239
|
-
del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
|
|
240
|
-
base_tbl_version = base.tbl_version.get()
|
|
241
|
-
if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
|
|
242
|
-
# also remove tbl_version from the base
|
|
243
|
-
base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
|
|
244
|
-
raise
|
|
245
|
-
Env.get().console_logger.info(
|
|
246
|
-
f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
|
|
218
|
+
tbl_id = md.tbl_md.tbl_id
|
|
219
|
+
view_path = TableVersionPath(
|
|
220
|
+
TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
|
|
247
221
|
)
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
222
|
+
ops = [
|
|
223
|
+
TableOp(
|
|
224
|
+
tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
|
|
225
|
+
),
|
|
226
|
+
TableOp(
|
|
227
|
+
tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
|
|
228
|
+
),
|
|
229
|
+
]
|
|
230
|
+
return md, ops
|
|
251
231
|
|
|
252
232
|
@classmethod
|
|
253
233
|
def _verify_column(cls, col: Column) -> None:
|
|
@@ -280,8 +260,11 @@ class View(Table):
|
|
|
280
260
|
md['is_view'] = True
|
|
281
261
|
md['is_snapshot'] = self._tbl_version_path.is_snapshot()
|
|
282
262
|
base_tbl = self._get_base_table()
|
|
283
|
-
|
|
284
|
-
|
|
263
|
+
if base_tbl is None:
|
|
264
|
+
md['base'] = None
|
|
265
|
+
else:
|
|
266
|
+
base_version = self._effective_base_versions[0]
|
|
267
|
+
md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
|
|
285
268
|
return md
|
|
286
269
|
|
|
287
270
|
def insert(
|
|
@@ -295,16 +278,21 @@ class View(Table):
|
|
|
295
278
|
print_stats: bool = False,
|
|
296
279
|
**kwargs: Any,
|
|
297
280
|
) -> UpdateStatus:
|
|
298
|
-
raise excs.Error(f'{self.
|
|
281
|
+
raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
|
|
299
282
|
|
|
300
283
|
def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
    """Always raises excs.Error: deleting from this kind of object is not supported; `where` is ignored."""
    message = f'{self._display_str()}: Cannot delete from a {self._display_name()}.'
    raise excs.Error(message)
|
|
302
285
|
|
|
303
286
|
def _get_base_table(self) -> Optional['Table']:
    """Look up the base table of this view in the catalog.

    Returns None when the version path has no base and this is not a pure snapshot.
    """
    tvp = self._tbl_version_path
    if tvp.base is None and not self._snapshot_only:
        # this can happen for a replica of a base table
        return None

    from pixeltable.catalog import Catalog

    # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
    # for the snapshot itself)
    if self._snapshot_only:
        base_id = tvp.tbl_id
    else:
        base_id = tvp.base.tbl_id

    with Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
        return catalog.Catalog.get().get_table_by_id(base_id)
|
|
308
296
|
|
|
309
297
|
@property
|
|
310
298
|
def _effective_base_versions(self) -> list[Optional[int]]:
|
|
@@ -315,8 +303,7 @@ class View(Table):
|
|
|
315
303
|
return effective_versions[1:]
|
|
316
304
|
|
|
317
305
|
def _table_descriptor(self) -> str:
|
|
318
|
-
|
|
319
|
-
result = [f'{display_name} {self._path()!r}']
|
|
306
|
+
result = [self._display_str()]
|
|
320
307
|
bases_descrs: list[str] = []
|
|
321
308
|
for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
|
|
322
309
|
if effective_version is None:
|
pixeltable/dataframe.py
CHANGED
|
@@ -1185,7 +1185,7 @@ class DataFrame:
|
|
|
1185
1185
|
"""
|
|
1186
1186
|
self._validate_mutable('delete', False)
|
|
1187
1187
|
if not self._first_tbl.is_insertable():
|
|
1188
|
-
raise excs.Error('Cannot delete
|
|
1188
|
+
raise excs.Error('Cannot use `delete` on a view.')
|
|
1189
1189
|
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1190
1190
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1191
1191
|
|
|
@@ -1196,14 +1196,27 @@ class DataFrame:
|
|
|
1196
1196
|
op_name: The name of the operation for which the test is being performed.
|
|
1197
1197
|
allow_select: If True, allow a select() specification in the Dataframe.
|
|
1198
1198
|
"""
|
|
1199
|
+
self._validate_mutable_op_sequence(op_name, allow_select)
|
|
1200
|
+
|
|
1201
|
+
# TODO: Reconcile these with Table.__check_mutable()
|
|
1202
|
+
assert len(self._from_clause.tbls) == 1
|
|
1203
|
+
if self._first_tbl.is_snapshot():
|
|
1204
|
+
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1205
|
+
if self._first_tbl.is_replica():
|
|
1206
|
+
raise excs.Error(f'Cannot use `{op_name}` on a replica.')
|
|
1207
|
+
|
|
1208
|
+
def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
|
|
1209
|
+
"""Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
|
|
1199
1210
|
if self.group_by_clause is not None or self.grouping_tbl is not None:
|
|
1200
|
-
raise excs.Error(f'Cannot use `{op_name}` after `group_by
|
|
1211
|
+
raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
|
|
1201
1212
|
if self.order_by_clause is not None:
|
|
1202
|
-
raise excs.Error(f'Cannot use `{op_name}` after `order_by
|
|
1213
|
+
raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
|
|
1203
1214
|
if self.select_list is not None and not allow_select:
|
|
1204
|
-
raise excs.Error(f'Cannot use `{op_name}` after `select
|
|
1215
|
+
raise excs.Error(f'Cannot use `{op_name}` after `select`.')
|
|
1205
1216
|
if self.limit_val is not None:
|
|
1206
|
-
raise excs.Error(f'Cannot use `{op_name}` after `limit
|
|
1217
|
+
raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
|
|
1218
|
+
if self._has_joins():
|
|
1219
|
+
raise excs.Error(f'Cannot use `{op_name}` after `join`.')
|
|
1207
1220
|
|
|
1208
1221
|
def as_dict(self) -> dict[str, Any]:
|
|
1209
1222
|
"""
|
pixeltable/env.py
CHANGED
|
@@ -20,7 +20,7 @@ from contextlib import contextmanager
|
|
|
20
20
|
from dataclasses import dataclass, field
|
|
21
21
|
from pathlib import Path
|
|
22
22
|
from sys import stdout
|
|
23
|
-
from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
|
|
23
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
|
|
24
24
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
25
25
|
|
|
26
26
|
import nest_asyncio # type: ignore[import-untyped]
|
|
@@ -86,6 +86,7 @@ class Env:
|
|
|
86
86
|
_resource_pool_info: dict[str, Any]
|
|
87
87
|
_current_conn: Optional[sql.Connection]
|
|
88
88
|
_current_session: Optional[sql.orm.Session]
|
|
89
|
+
_current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
|
|
89
90
|
_dbms: Optional[Dbms]
|
|
90
91
|
_event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
|
|
91
92
|
|
|
@@ -99,6 +100,7 @@ class Env:
|
|
|
99
100
|
def _init_env(cls, reinit_db: bool = False) -> None:
|
|
100
101
|
assert not cls.__initializing, 'Circular env initialization detected.'
|
|
101
102
|
cls.__initializing = True
|
|
103
|
+
cls._instance = None
|
|
102
104
|
env = Env()
|
|
103
105
|
env._set_up(reinit_db=reinit_db)
|
|
104
106
|
env._upgrade_metadata()
|
|
@@ -142,6 +144,7 @@ class Env:
|
|
|
142
144
|
self._resource_pool_info = {}
|
|
143
145
|
self._current_conn = None
|
|
144
146
|
self._current_session = None
|
|
147
|
+
self._current_isolation_level = None
|
|
145
148
|
self._dbms = None
|
|
146
149
|
self._event_loop = None
|
|
147
150
|
|
|
@@ -230,20 +233,34 @@ class Env:
|
|
|
230
233
|
return self._db_server is not None
|
|
231
234
|
|
|
232
235
|
@contextmanager
def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
    """
    Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.

    for_write: if True, uses serializable isolation; if False, uses repeatable_read

    If a transaction is already open, its connection is reused and yielded as-is; in that case for_write=True
    is only valid when the open transaction is serializable.

    TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
    that avoids tripping over any pending ops
    """
    if self._current_conn is None:
        assert self._current_session is None
        try:
            self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
            with (
                self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                sql.orm.Session(conn) as session,
                conn.begin(),
            ):
                self._current_conn = conn
                self._current_session = session
                yield conn
        finally:
            # reset unconditionally, so a failed transaction doesn't leave stale connection state behind
            self._current_session = None
            self._current_conn = None
            self._current_isolation_level = None
    else:
        assert self._current_session is not None
        # The level is stored upper-case ('SERIALIZABLE'/'REPEATABLE_READ'); comparing against the lower-case
        # spelling made every nested for_write=True call fail. A nested read is fine under either level.
        assert not for_write or self._current_isolation_level == 'SERIALIZABLE'
        yield self._current_conn
|
|
248
265
|
|
|
249
266
|
def configure_logging(
|
|
@@ -90,7 +90,9 @@ class DataRowBatch:
|
|
|
90
90
|
idx_range = slice(0, len(self.rows))
|
|
91
91
|
for row in self.rows[idx_range]:
|
|
92
92
|
for info in stored_img_info:
|
|
93
|
-
|
|
93
|
+
col = info.col
|
|
94
|
+
assert col.tbl.id == self.tbl.id
|
|
95
|
+
filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
|
|
94
96
|
row.flush_img(info.slot_idx, filepath)
|
|
95
97
|
for slot_idx in flushed_slot_idxs:
|
|
96
98
|
row.flush_img(slot_idx)
|
|
@@ -63,13 +63,12 @@ class InMemoryDataNode(ExecNode):
|
|
|
63
63
|
for col_name, val in input_row.items():
|
|
64
64
|
col_info = user_cols_by_name.get(col_name)
|
|
65
65
|
assert col_info is not None
|
|
66
|
-
|
|
67
|
-
if
|
|
68
|
-
# this is a literal
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
self.output_rows[row_idx][col_info.slot_idx] = path
|
|
66
|
+
col = col_info.col
|
|
67
|
+
if col.col_type.is_image_type() and isinstance(val, bytes):
|
|
68
|
+
# this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
|
|
69
|
+
assert col.tbl.id == self.tbl.id
|
|
70
|
+
path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
|
|
71
|
+
self.output_rows[row_idx][col_info.slot_idx] = str(path)
|
|
73
72
|
else:
|
|
74
73
|
self.output_rows[row_idx][col_info.slot_idx] = val
|
|
75
74
|
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -325,7 +325,8 @@ class ColumnRef(Expr):
|
|
|
325
325
|
@classmethod
def get_column(cls, d: dict) -> catalog.Column:
    """Resolve the column identified by the 'tbl_id', 'tbl_version' and 'col_id' entries of `d`."""
    tbl_id = UUID(d['tbl_id'])
    version = d['tbl_version']
    col_id = d['col_id']
    # validate_initialized=False: this gets called as part of TableVersion.init()
    tv = catalog.Catalog.get().get_tbl_version(tbl_id, version, validate_initialized=False)
    # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
    return next(c for c in tv.cols if c.id == col_id)
|
pixeltable/functions/gemini.py
CHANGED
|
@@ -7,7 +7,6 @@ the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini)
|
|
|
7
7
|
|
|
8
8
|
import asyncio
|
|
9
9
|
import io
|
|
10
|
-
import tempfile
|
|
11
10
|
from pathlib import Path
|
|
12
11
|
from typing import TYPE_CHECKING, Optional
|
|
13
12
|
|
|
@@ -215,9 +214,10 @@ async def generate_videos(
|
|
|
215
214
|
video_bytes = await _genai_client().aio.files.download(file=video.video) # type: ignore[arg-type]
|
|
216
215
|
assert video_bytes is not None
|
|
217
216
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
217
|
+
# Create a temporary file to store the video bytes
|
|
218
|
+
output_path = env.Env.get().create_tmp_path('.mp4')
|
|
219
|
+
Path(output_path).write_bytes(video_bytes)
|
|
220
|
+
return str(output_path)
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
@generate_videos.resource_pool
|
pixeltable/functions/openai.py
CHANGED
|
@@ -13,7 +13,6 @@ import logging
|
|
|
13
13
|
import math
|
|
14
14
|
import pathlib
|
|
15
15
|
import re
|
|
16
|
-
import uuid
|
|
17
16
|
from typing import TYPE_CHECKING, Any, Callable, Optional, Type
|
|
18
17
|
|
|
19
18
|
import httpx
|
|
@@ -207,7 +206,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
|
|
|
207
206
|
|
|
208
207
|
content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
|
|
209
208
|
ext = model_kwargs.get('response_format', 'mp3')
|
|
210
|
-
output_filename = str(env.Env.get().
|
|
209
|
+
output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
|
|
211
210
|
content.write_to_file(output_filename)
|
|
212
211
|
return output_filename
|
|
213
212
|
|
pixeltable/functions/video.py
CHANGED
|
@@ -2,9 +2,6 @@
|
|
|
2
2
|
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import tempfile
|
|
6
|
-
import uuid
|
|
7
|
-
from pathlib import Path
|
|
8
5
|
from typing import Any, Optional
|
|
9
6
|
|
|
10
7
|
import av
|
|
@@ -59,8 +56,7 @@ class make_video(pxt.Aggregator):
|
|
|
59
56
|
if frame is None:
|
|
60
57
|
return
|
|
61
58
|
if self.container is None:
|
|
62
|
-
|
|
63
|
-
self.out_file = Path(output_filename)
|
|
59
|
+
self.out_file = env.Env.get().create_tmp_path('.mp4')
|
|
64
60
|
self.container = av.open(str(self.out_file), mode='w')
|
|
65
61
|
self.stream = self.container.add_stream('h264', rate=self.fps)
|
|
66
62
|
self.stream.pix_fmt = 'yuv420p'
|
|
@@ -109,7 +105,7 @@ def extract_audio(
|
|
|
109
105
|
return None
|
|
110
106
|
audio_stream = container.streams.audio[stream_idx]
|
|
111
107
|
# create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
|
|
112
|
-
output_filename = str(env.Env.get().
|
|
108
|
+
output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
|
|
113
109
|
|
|
114
110
|
with av.open(output_filename, 'w', format=format) as output_container:
|
|
115
111
|
output_stream = output_container.add_stream(codec or default_codec)
|
pixeltable/globals.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from pandas.io.formats.style import Styler
|
|
10
10
|
|
|
11
|
-
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
|
|
11
|
+
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
|
|
12
12
|
from pixeltable.catalog import Catalog, TableVersionPath
|
|
13
13
|
from pixeltable.catalog.insertable_table import OnErrorParameter
|
|
14
14
|
from pixeltable.config import Config
|
|
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
|
|
|
44
44
|
|
|
45
45
|
|
|
46
46
|
def create_table(
|
|
47
|
-
|
|
47
|
+
path: str,
|
|
48
48
|
schema: Optional[dict[str, Any]] = None,
|
|
49
49
|
*,
|
|
50
50
|
source: Optional[TableDataSource] = None,
|
|
@@ -58,14 +58,24 @@ def create_table(
|
|
|
58
58
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
59
59
|
extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
|
|
60
60
|
) -> catalog.Table:
|
|
61
|
-
"""Create a new base table.
|
|
61
|
+
"""Create a new base table. Exactly one of `schema` or `source` must be provided.
|
|
62
|
+
|
|
63
|
+
If a `schema` is provided, then an empty table will be created with the specified schema.
|
|
64
|
+
|
|
65
|
+
If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
|
|
66
|
+
contents of the specified data, and the data will be imported from the specified source into the new table. The
|
|
67
|
+
source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
|
|
62
68
|
|
|
63
69
|
Args:
|
|
64
|
-
|
|
65
|
-
schema:
|
|
66
|
-
source: A data source
|
|
67
|
-
source_format:
|
|
68
|
-
|
|
70
|
+
path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
|
|
71
|
+
schema: Schema for the new table, mapping column names to Pixeltable types.
|
|
72
|
+
source: A data source (file, URL, DataFrame, or list of rows) to import from.
|
|
73
|
+
source_format: Must be used in conjunction with a `source`.
|
|
74
|
+
If specified, then the given format will be used to read the source data. (Otherwise,
|
|
75
|
+
Pixeltable will attempt to infer the format from the source data.)
|
|
76
|
+
schema_overrides: Must be used in conjunction with a `source`.
|
|
77
|
+
If specified, then columns in `schema_overrides` will be given the specified types.
|
|
78
|
+
(Pixeltable will attempt to infer the types of any columns not specified.)
|
|
69
79
|
on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
|
|
70
80
|
invalid media file (such as a corrupt image) for one of the inserted rows.
|
|
71
81
|
|
|
@@ -81,14 +91,15 @@ def create_table(
|
|
|
81
91
|
|
|
82
92
|
- `'on_read'`: validate media files at query time
|
|
83
93
|
- `'on_write'`: validate media files during insert/update operations
|
|
84
|
-
if_exists:
|
|
85
|
-
Must be one of the following:
|
|
94
|
+
if_exists: Determines the behavior if a table already exists at the specified path location.
|
|
86
95
|
|
|
87
96
|
- `'error'`: raise an error
|
|
88
97
|
- `'ignore'`: do nothing and return the existing table handle
|
|
89
|
-
- `'replace'`: if the existing table has no views, drop and replace it with a new one
|
|
90
|
-
|
|
91
|
-
|
|
98
|
+
- `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
|
|
99
|
+
raise an error if the existing table has views or snapshots
|
|
100
|
+
- `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
|
|
101
|
+
extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
|
|
102
|
+
passed along to the source data provider.
|
|
92
103
|
|
|
93
104
|
Returns:
|
|
94
105
|
A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
|
|
@@ -114,7 +125,7 @@ def create_table(
|
|
|
114
125
|
>>> tbl1 = pxt.get_table('orig_table')
|
|
115
126
|
... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
|
|
116
127
|
|
|
117
|
-
Create a table if does not already exist, otherwise get the existing table:
|
|
128
|
+
Create a table if it does not already exist, otherwise get the existing table:
|
|
118
129
|
|
|
119
130
|
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
|
|
120
131
|
|
|
@@ -130,12 +141,12 @@ def create_table(
|
|
|
130
141
|
from pixeltable.io.utils import normalize_primary_key_parameter
|
|
131
142
|
|
|
132
143
|
if (schema is None) == (source is None):
|
|
133
|
-
raise excs.Error('
|
|
144
|
+
raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
|
|
134
145
|
|
|
135
146
|
if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
|
|
136
147
|
raise excs.Error('`schema` must be a non-empty dictionary')
|
|
137
148
|
|
|
138
|
-
path_obj = catalog.Path(
|
|
149
|
+
path_obj = catalog.Path(path)
|
|
139
150
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
140
151
|
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
141
152
|
primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
|
|
@@ -146,7 +157,14 @@ def create_table(
|
|
|
146
157
|
tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
|
|
147
158
|
tds.check_source_format()
|
|
148
159
|
data_source = tds.specialize()
|
|
149
|
-
|
|
160
|
+
src_schema_overrides: dict[str, ts.ColumnType] = {}
|
|
161
|
+
if schema_overrides is not None:
|
|
162
|
+
for col_name, py_type in schema_overrides.items():
|
|
163
|
+
col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
|
|
164
|
+
if col_type is None:
|
|
165
|
+
raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
|
|
166
|
+
src_schema_overrides[col_name] = col_type
|
|
167
|
+
data_source.src_schema_overrides = src_schema_overrides
|
|
150
168
|
data_source.src_pk = primary_key
|
|
151
169
|
data_source.infer_schema()
|
|
152
170
|
schema = data_source.pxt_schema
|
|
@@ -255,9 +273,7 @@ def create_view(
|
|
|
255
273
|
tbl_version_path = base._tbl_version_path
|
|
256
274
|
sample_clause = None
|
|
257
275
|
elif isinstance(base, DataFrame):
|
|
258
|
-
base.
|
|
259
|
-
if len(base._from_clause.tbls) > 1:
|
|
260
|
-
raise excs.Error('Cannot create a view of a join')
|
|
276
|
+
base._validate_mutable_op_sequence('create_view', allow_select=True)
|
|
261
277
|
tbl_version_path = base._from_clause.tbls[0]
|
|
262
278
|
where = base.where_clause
|
|
263
279
|
sample_clause = base.sample_clause
|
|
@@ -537,9 +553,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
537
553
|
|
|
538
554
|
>>> pxt.list_tables('dir1')
|
|
539
555
|
"""
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
556
|
+
return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
    """Return the paths (as strings) of the tables found under `dir_path`.

    Args:
        dir_path: directory path; the empty string is accepted.
        recursive: passed through to Catalog.get_dir_contents().
        allow_system_paths: passed through to catalog.Path().
    """
    root = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
    entries = Catalog.get().get_dir_contents(root, recursive=recursive)
    table_paths = _extract_paths(entries, parent=root, entry_type=catalog.Table)
    return list(map(str, table_paths))
|
|
544
563
|
|
|
545
564
|
|
|
@@ -647,13 +666,16 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
647
666
|
To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
|
|
648
667
|
[list_dirs()][pixeltable.list_dirs] instead.
|
|
649
668
|
"""
|
|
669
|
+
from pixeltable.catalog import retry_loop
|
|
650
670
|
from pixeltable.metadata import schema
|
|
651
671
|
|
|
652
672
|
cat = Catalog.get()
|
|
653
673
|
path_obj = catalog.Path(path, empty_is_valid=True)
|
|
654
674
|
dir_entries = cat.get_dir_contents(path_obj)
|
|
655
|
-
|
|
656
|
-
|
|
675
|
+
|
|
676
|
+
@retry_loop(for_write=False)
|
|
677
|
+
def op() -> list[list[str]]:
|
|
678
|
+
rows: list[list[str]] = []
|
|
657
679
|
for name, entry in dir_entries.items():
|
|
658
680
|
if name.startswith('_'):
|
|
659
681
|
continue
|
|
@@ -679,6 +701,9 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
679
701
|
if md['is_replica']:
|
|
680
702
|
kind = f'{kind}-replica'
|
|
681
703
|
rows.append([name, kind, version, base])
|
|
704
|
+
return rows
|
|
705
|
+
|
|
706
|
+
rows = op()
|
|
682
707
|
|
|
683
708
|
rows = sorted(rows, key=lambda x: x[0])
|
|
684
709
|
df = pd.DataFrame(
|
pixeltable/io/datarows.py
CHANGED
|
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def _infer_schema_from_rows(
|
|
11
|
-
rows: Iterable[dict[str, Any]], schema_overrides: dict[str,
|
|
11
|
+
rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
|
|
12
12
|
) -> dict[str, ts.ColumnType]:
|
|
13
13
|
schema: dict[str, ts.ColumnType] = {}
|
|
14
14
|
cols_with_nones: set[str] = set()
|
|
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
|
|
|
20
20
|
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
21
21
|
# is specified.
|
|
22
22
|
if col_name not in schema:
|
|
23
|
+
assert isinstance(schema_overrides[col_name], ts.ColumnType)
|
|
23
24
|
schema[col_name] = schema_overrides[col_name]
|
|
24
25
|
elif value is not None:
|
|
25
26
|
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
pixeltable/io/pandas.py
CHANGED
|
@@ -132,6 +132,7 @@ def df_infer_schema(
|
|
|
132
132
|
pd_schema: dict[str, ts.ColumnType] = {}
|
|
133
133
|
for pd_name, pd_dtype in zip(df.columns, df.dtypes):
|
|
134
134
|
if pd_name in schema_overrides:
|
|
135
|
+
assert isinstance(schema_overrides[pd_name], ts.ColumnType)
|
|
135
136
|
pxt_type = schema_overrides[pd_name]
|
|
136
137
|
else:
|
|
137
138
|
pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
|