pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +41 -29
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +30 -10
- pixeltable/catalog/table.py +198 -86
- pixeltable/catalog/table_version.py +47 -53
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +17 -18
- pixeltable/dataframe.py +27 -36
- pixeltable/env.py +7 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +189 -43
- pixeltable/exec/data_row_batch.py +5 -22
- pixeltable/exec/exec_context.py +2 -2
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval_node.py +23 -16
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +12 -5
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +97 -14
- pixeltable/exprs/comparison.py +10 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +6 -11
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +9 -9
- pixeltable/func/expr_template_function.py +6 -5
- pixeltable/func/function.py +11 -10
- pixeltable/func/udf.py +6 -11
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +155 -45
- pixeltable/functions/llama_cpp.py +107 -0
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +5 -2
- pixeltable/globals.py +67 -26
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +17 -15
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_db_dump.py +1 -1
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +100 -36
- pixeltable/utils/coco.py +5 -5
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +12 -13
- pixeltable/utils/s3.py +6 -3
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
- pixeltable-0.2.23.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
|
@@ -26,7 +26,7 @@ from pixeltable.utils.media_store import MediaStore
|
|
|
26
26
|
|
|
27
27
|
from ..func.globals import resolve_symbol
|
|
28
28
|
from .column import Column
|
|
29
|
-
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier
|
|
29
|
+
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier, MediaValidation
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING:
|
|
32
32
|
from pixeltable import exec, store
|
|
@@ -53,6 +53,7 @@ class TableVersion:
|
|
|
53
53
|
name: str
|
|
54
54
|
version: int
|
|
55
55
|
comment: str
|
|
56
|
+
media_validation: MediaValidation
|
|
56
57
|
num_retained_versions: int
|
|
57
58
|
schema_version: int
|
|
58
59
|
view_md: Optional[schema.ViewMd]
|
|
@@ -109,6 +110,7 @@ class TableVersion:
|
|
|
109
110
|
self.view_md = tbl_md.view_md # save this as-is, it's needed for _create_md()
|
|
110
111
|
is_view = tbl_md.view_md is not None
|
|
111
112
|
self.is_snapshot = (is_view and tbl_md.view_md.is_snapshot) or bool(is_snapshot)
|
|
113
|
+
self.media_validation = MediaValidation[schema_version_md.media_validation.upper()]
|
|
112
114
|
# a mutable TableVersion doesn't have a static version
|
|
113
115
|
self.effective_version = self.version if self.is_snapshot else None
|
|
114
116
|
|
|
@@ -182,7 +184,7 @@ class TableVersion:
|
|
|
182
184
|
@classmethod
|
|
183
185
|
def create(
|
|
184
186
|
cls, session: orm.Session, dir_id: UUID, name: str, cols: list[Column], num_retained_versions: int,
|
|
185
|
-
comment: str, base_path: Optional[pxt.catalog.TableVersionPath] = None,
|
|
187
|
+
comment: str, media_validation: MediaValidation, base_path: Optional[pxt.catalog.TableVersionPath] = None,
|
|
186
188
|
view_md: Optional[schema.ViewMd] = None
|
|
187
189
|
) -> tuple[UUID, Optional[TableVersion]]:
|
|
188
190
|
# assign ids
|
|
@@ -191,8 +193,6 @@ class TableVersion:
|
|
|
191
193
|
col.id = pos
|
|
192
194
|
col.schema_version_add = 0
|
|
193
195
|
cols_by_name[col.name] = col
|
|
194
|
-
if col.value_expr is None and col.compute_func is not None:
|
|
195
|
-
cls._create_value_expr(col, base_path)
|
|
196
196
|
if col.is_computed:
|
|
197
197
|
col.check_value_expr()
|
|
198
198
|
|
|
@@ -214,11 +214,17 @@ class TableVersion:
|
|
|
214
214
|
tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
|
|
215
215
|
|
|
216
216
|
# create schema.TableSchemaVersion
|
|
217
|
-
schema_col_md
|
|
217
|
+
schema_col_md: dict[int, schema.SchemaColumn] = {}
|
|
218
|
+
for pos, col in enumerate(cols):
|
|
219
|
+
md = schema.SchemaColumn(
|
|
220
|
+
pos=pos, name=col.name,
|
|
221
|
+
media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
|
|
222
|
+
schema_col_md[col.id] = md
|
|
218
223
|
|
|
219
224
|
schema_version_md = schema.TableSchemaVersionMd(
|
|
220
225
|
schema_version=0, preceding_schema_version=None, columns=schema_col_md,
|
|
221
|
-
num_retained_versions=num_retained_versions, comment=comment
|
|
226
|
+
num_retained_versions=num_retained_versions, comment=comment,
|
|
227
|
+
media_validation=media_validation.name.lower())
|
|
222
228
|
schema_version_record = schema.TableSchemaVersion(
|
|
223
229
|
tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
|
|
224
230
|
|
|
@@ -285,10 +291,15 @@ class TableVersion:
|
|
|
285
291
|
self.cols_by_name = {}
|
|
286
292
|
self.cols_by_id = {}
|
|
287
293
|
for col_md in tbl_md.column_md.values():
|
|
288
|
-
|
|
294
|
+
schema_col_md = schema_version_md.columns[col_md.id] if col_md.id in schema_version_md.columns else None
|
|
295
|
+
col_name = schema_col_md.name if schema_col_md is not None else None
|
|
296
|
+
media_val = (
|
|
297
|
+
MediaValidation[schema_col_md.media_validation.upper()]
|
|
298
|
+
if schema_col_md is not None and schema_col_md.media_validation is not None else None
|
|
299
|
+
)
|
|
289
300
|
col = Column(
|
|
290
301
|
col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
|
|
291
|
-
is_pk=col_md.is_pk, stored=col_md.stored,
|
|
302
|
+
is_pk=col_md.is_pk, stored=col_md.stored, media_validation=media_val,
|
|
292
303
|
schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop,
|
|
293
304
|
value_expr_dict=col_md.value_expr)
|
|
294
305
|
col.tbl = self
|
|
@@ -349,7 +360,8 @@ class TableVersion:
|
|
|
349
360
|
self.store_tbl = StoreTable(self)
|
|
350
361
|
|
|
351
362
|
def _update_md(
|
|
352
|
-
|
|
363
|
+
self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True,
|
|
364
|
+
preceding_schema_version: Optional[int] = None
|
|
353
365
|
) -> None:
|
|
354
366
|
"""Writes table metadata to the database.
|
|
355
367
|
|
|
@@ -480,37 +492,35 @@ class TableVersion:
|
|
|
480
492
|
self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
|
|
481
493
|
_logger.info(f'Dropped index {idx_md.name} on table {self.name}')
|
|
482
494
|
|
|
483
|
-
def
|
|
495
|
+
def add_columns(self, cols: Iterable[Column], print_stats: bool, on_error: Literal['abort', 'ignore']) -> UpdateStatus:
|
|
484
496
|
"""Adds a column to the table.
|
|
485
497
|
"""
|
|
486
498
|
assert not self.is_snapshot
|
|
487
|
-
assert is_valid_identifier(col.name)
|
|
488
|
-
assert col.stored is not None
|
|
489
|
-
assert col.name not in self.cols_by_name
|
|
490
|
-
col
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
if col.compute_func is not None:
|
|
495
|
-
# create value_expr from compute_func
|
|
496
|
-
self._create_value_expr(col, self.path)
|
|
499
|
+
assert all(is_valid_identifier(col.name) for col in cols)
|
|
500
|
+
assert all(col.stored is not None for col in cols)
|
|
501
|
+
assert all(col.name not in self.cols_by_name for col in cols)
|
|
502
|
+
for col in cols:
|
|
503
|
+
col.tbl = self
|
|
504
|
+
col.id = self.next_col_id
|
|
505
|
+
self.next_col_id += 1
|
|
497
506
|
|
|
498
507
|
# we're creating a new schema version
|
|
499
508
|
self.version += 1
|
|
500
509
|
preceding_schema_version = self.schema_version
|
|
501
510
|
self.schema_version = self.version
|
|
502
511
|
with Env.get().engine.begin() as conn:
|
|
503
|
-
status = self._add_columns(
|
|
504
|
-
|
|
512
|
+
status = self._add_columns(cols, conn, print_stats=print_stats, on_error=on_error)
|
|
513
|
+
for col in cols:
|
|
514
|
+
_ = self._add_default_index(col, conn)
|
|
505
515
|
self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
|
|
506
|
-
_logger.info(f'Added
|
|
516
|
+
_logger.info(f'Added columns {[col.name for col in cols]} to table {self.name}, new version: {self.version}')
|
|
507
517
|
|
|
508
518
|
msg = (
|
|
509
519
|
f'Added {status.num_rows} column value{"" if status.num_rows == 1 else "s"} '
|
|
510
520
|
f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}.'
|
|
511
521
|
)
|
|
512
522
|
print(msg)
|
|
513
|
-
_logger.info(f'
|
|
523
|
+
_logger.info(f'Columns {[col.name for col in cols]}: {msg}')
|
|
514
524
|
return status
|
|
515
525
|
|
|
516
526
|
def _add_columns(
|
|
@@ -710,20 +720,22 @@ class TableVersion:
|
|
|
710
720
|
|
|
711
721
|
if conn is None:
|
|
712
722
|
with Env.get().engine.begin() as conn:
|
|
713
|
-
return self._insert(
|
|
723
|
+
return self._insert(
|
|
724
|
+
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
|
714
725
|
else:
|
|
715
|
-
return self._insert(
|
|
726
|
+
return self._insert(
|
|
727
|
+
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
|
716
728
|
|
|
717
729
|
def _insert(
|
|
718
730
|
self, exec_plan: 'exec.ExecNode', conn: sql.engine.Connection, timestamp: float, *,
|
|
719
|
-
rowids: Optional[Iterator[int]] = None, print_stats: bool = False,
|
|
731
|
+
rowids: Optional[Iterator[int]] = None, print_stats: bool = False, abort_on_exc: bool = False
|
|
720
732
|
) -> UpdateStatus:
|
|
721
733
|
"""Insert rows produced by exec_plan and propagate to views"""
|
|
722
734
|
# we're creating a new version
|
|
723
735
|
self.version += 1
|
|
724
736
|
result = UpdateStatus()
|
|
725
737
|
num_rows, num_excs, cols_with_excs = self.store_tbl.insert_rows(
|
|
726
|
-
exec_plan, conn, v_min=self.version, rowids=rowids)
|
|
738
|
+
exec_plan, conn, v_min=self.version, rowids=rowids, abort_on_exc=abort_on_exc)
|
|
727
739
|
result.num_rows = num_rows
|
|
728
740
|
result.num_excs = num_excs
|
|
729
741
|
result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
|
|
@@ -1124,28 +1136,6 @@ class TableVersion:
|
|
|
1124
1136
|
names = [c.name for c in self.cols_by_name.values() if c.is_computed]
|
|
1125
1137
|
return names
|
|
1126
1138
|
|
|
1127
|
-
@classmethod
|
|
1128
|
-
def _create_value_expr(cls, col: Column, path: pxt.catalog.TableVersionPath) -> None:
|
|
1129
|
-
"""
|
|
1130
|
-
Create col.value_expr, given col.compute_func.
|
|
1131
|
-
Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
|
|
1132
|
-
Does not update Column.dependent_cols.
|
|
1133
|
-
"""
|
|
1134
|
-
assert col.value_expr is None
|
|
1135
|
-
assert col.compute_func is not None
|
|
1136
|
-
from pixeltable import exprs
|
|
1137
|
-
params = inspect.signature(col.compute_func).parameters
|
|
1138
|
-
args: list[exprs.ColumnRef] = []
|
|
1139
|
-
for param_name in params:
|
|
1140
|
-
param = path.get_column(param_name)
|
|
1141
|
-
if param is None:
|
|
1142
|
-
raise excs.Error(
|
|
1143
|
-
f'Column {col.name}: Callable parameter refers to an unknown column: {param_name}')
|
|
1144
|
-
args.append(exprs.ColumnRef(param))
|
|
1145
|
-
fn = func.make_function(
|
|
1146
|
-
col.compute_func, return_type=col.col_type, param_types=[arg.col_type for arg in args])
|
|
1147
|
-
col.set_value_expr(fn(*args))
|
|
1148
|
-
|
|
1149
1139
|
def _record_refd_columns(self, col: Column) -> None:
|
|
1150
1140
|
"""Update Column.dependent_cols for all cols referenced in col.value_expr.
|
|
1151
1141
|
"""
|
|
@@ -1203,7 +1193,8 @@ class TableVersion:
|
|
|
1203
1193
|
name=self.name, current_version=self.version, current_schema_version=self.schema_version,
|
|
1204
1194
|
next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
|
|
1205
1195
|
column_md=self._create_column_md(self.cols), index_md=self.idx_md,
|
|
1206
|
-
external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md
|
|
1196
|
+
external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md,
|
|
1197
|
+
)
|
|
1207
1198
|
|
|
1208
1199
|
def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
|
|
1209
1200
|
return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
|
|
@@ -1211,11 +1202,14 @@ class TableVersion:
|
|
|
1211
1202
|
def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
|
|
1212
1203
|
column_md: dict[int, schema.SchemaColumn] = {}
|
|
1213
1204
|
for pos, col in enumerate(self.cols_by_name.values()):
|
|
1214
|
-
column_md[col.id] = schema.SchemaColumn(
|
|
1205
|
+
column_md[col.id] = schema.SchemaColumn(
|
|
1206
|
+
pos=pos, name=col.name,
|
|
1207
|
+
media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
|
|
1215
1208
|
# preceding_schema_version to be set by the caller
|
|
1216
1209
|
return schema.TableSchemaVersionMd(
|
|
1217
1210
|
schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,
|
|
1218
|
-
columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment
|
|
1211
|
+
columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment,
|
|
1212
|
+
media_validation=self.media_validation.name.lower())
|
|
1219
1213
|
|
|
1220
1214
|
def as_dict(self) -> dict:
|
|
1221
1215
|
return {'id': str(self.id), 'effective_version': self.effective_version}
|
|
@@ -81,13 +81,13 @@ class TableVersionPath:
|
|
|
81
81
|
return None
|
|
82
82
|
return self.base.find_tbl_version(id)
|
|
83
83
|
|
|
84
|
-
def
|
|
84
|
+
def get_column_ref(self, col_name: str) -> exprs.ColumnRef:
|
|
85
85
|
"""Return a ColumnRef for the given column name."""
|
|
86
86
|
from pixeltable.exprs import ColumnRef
|
|
87
87
|
if col_name not in self.tbl_version.cols_by_name:
|
|
88
88
|
if self.base is None:
|
|
89
89
|
raise AttributeError(f'Column {col_name} unknown')
|
|
90
|
-
return
|
|
90
|
+
return self.base.get_column_ref(col_name)
|
|
91
91
|
col = self.tbl_version.cols_by_name[col_name]
|
|
92
92
|
return ColumnRef(col)
|
|
93
93
|
|
pixeltable/catalog/view.py
CHANGED
|
@@ -2,24 +2,21 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import inspect
|
|
4
4
|
import logging
|
|
5
|
-
from typing import TYPE_CHECKING, Any,
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
8
|
import sqlalchemy.orm as orm
|
|
9
9
|
|
|
10
|
-
import pixeltable.catalog as catalog
|
|
11
10
|
import pixeltable.exceptions as excs
|
|
12
|
-
import pixeltable.exprs as exprs
|
|
13
|
-
import pixeltable.func as func
|
|
14
11
|
import pixeltable.metadata.schema as md_schema
|
|
12
|
+
import pixeltable.type_system as ts
|
|
13
|
+
from pixeltable import catalog, exprs, func
|
|
15
14
|
from pixeltable.env import Env
|
|
16
|
-
from pixeltable.exceptions import Error
|
|
17
15
|
from pixeltable.iterators import ComponentIterator
|
|
18
|
-
from pixeltable.type_system import IntType, InvalidType
|
|
19
16
|
|
|
20
17
|
from .catalog import Catalog
|
|
21
18
|
from .column import Column
|
|
22
|
-
from .globals import _POS_COLUMN_NAME, UpdateStatus
|
|
19
|
+
from .globals import _POS_COLUMN_NAME, UpdateStatus, MediaValidation
|
|
23
20
|
from .table import Table
|
|
24
21
|
from .table_version import TableVersion
|
|
25
22
|
from .table_version_path import TableVersionPath
|
|
@@ -52,9 +49,10 @@ class View(Table):
|
|
|
52
49
|
|
|
53
50
|
@classmethod
|
|
54
51
|
def _create(
|
|
55
|
-
cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns:
|
|
52
|
+
cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: dict[str, Any],
|
|
56
53
|
predicate: Optional['pxt.exprs.Expr'], is_snapshot: bool, num_retained_versions: int, comment: str,
|
|
57
|
-
|
|
54
|
+
media_validation: MediaValidation,
|
|
55
|
+
iterator_cls: Optional[type[ComponentIterator]], iterator_args: Optional[dict]
|
|
58
56
|
) -> View:
|
|
59
57
|
columns = cls._create_columns(additional_columns)
|
|
60
58
|
cls._verify_schema(columns)
|
|
@@ -92,17 +90,17 @@ class View(Table):
|
|
|
92
90
|
func.Parameter(param_name, param_type, kind=inspect.Parameter.POSITIONAL_OR_KEYWORD)
|
|
93
91
|
for param_name, param_type in iterator_cls.input_schema().items()
|
|
94
92
|
]
|
|
95
|
-
sig = func.Signature(InvalidType(), params)
|
|
93
|
+
sig = func.Signature(ts.InvalidType(), params)
|
|
96
94
|
from pixeltable.exprs import FunctionCall
|
|
97
95
|
FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
|
|
98
96
|
except TypeError as e:
|
|
99
|
-
raise Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
97
|
+
raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
100
98
|
|
|
101
99
|
# prepend pos and output_schema columns to cols:
|
|
102
100
|
# a component view exposes the pos column of its rowid;
|
|
103
101
|
# we create that column here, so it gets assigned a column id;
|
|
104
102
|
# stored=False: it is not stored separately (it's already stored as part of the rowid)
|
|
105
|
-
iterator_cols = [Column(_POS_COLUMN_NAME, IntType(), stored=False)]
|
|
103
|
+
iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
|
|
106
104
|
output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
|
|
107
105
|
iterator_cols.extend([
|
|
108
106
|
Column(col_name, col_type, stored=col_name not in unstored_cols)
|
|
@@ -112,12 +110,12 @@ class View(Table):
|
|
|
112
110
|
iterator_col_names = {col.name for col in iterator_cols}
|
|
113
111
|
for col in columns:
|
|
114
112
|
if col.name in iterator_col_names:
|
|
115
|
-
raise Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
|
|
113
|
+
raise excs.Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
|
|
116
114
|
columns = iterator_cols + columns
|
|
117
115
|
|
|
118
116
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
119
117
|
from pixeltable.exprs import InlineDict
|
|
120
|
-
iterator_args_expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
118
|
+
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
121
119
|
iterator_class_fqn = f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None \
|
|
122
120
|
else None
|
|
123
121
|
base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
|
|
@@ -142,7 +140,8 @@ class View(Table):
|
|
|
142
140
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None)
|
|
143
141
|
|
|
144
142
|
id, tbl_version = TableVersion.create(
|
|
145
|
-
session, dir_id, name, columns, num_retained_versions, comment,
|
|
143
|
+
session, dir_id, name, columns, num_retained_versions, comment, media_validation=media_validation,
|
|
144
|
+
base_path=base_version_path, view_md=view_md)
|
|
146
145
|
if tbl_version is None:
|
|
147
146
|
# this is purely a snapshot: we use the base's tbl version path
|
|
148
147
|
view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
|
|
@@ -168,11 +167,11 @@ class View(Table):
|
|
|
168
167
|
|
|
169
168
|
@classmethod
|
|
170
169
|
def _verify_column(
|
|
171
|
-
cls, col: Column, existing_column_names:
|
|
170
|
+
cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
|
|
172
171
|
) -> None:
|
|
173
172
|
# make sure that columns are nullable or have a default
|
|
174
173
|
if not col.col_type.nullable and not col.is_computed:
|
|
175
|
-
raise Error(f'Column {col.name}: non-computed columns in views must be nullable')
|
|
174
|
+
raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
|
|
176
175
|
super()._verify_column(col, existing_column_names, existing_query_names)
|
|
177
176
|
|
|
178
177
|
@classmethod
|
|
@@ -217,7 +216,7 @@ class View(Table):
|
|
|
217
216
|
|
|
218
217
|
def insert(
|
|
219
218
|
self, rows: Optional[Iterable[dict[str, Any]]] = None, /, *, print_stats: bool = False,
|
|
220
|
-
|
|
219
|
+
on_error: Literal['abort', 'ignore'] = 'abort', **kwargs: Any
|
|
221
220
|
) -> UpdateStatus:
|
|
222
221
|
raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
|
|
223
222
|
|
pixeltable/dataframe.py
CHANGED
|
@@ -8,7 +8,7 @@ import logging
|
|
|
8
8
|
import mimetypes
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Callable,
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
|
|
12
12
|
|
|
13
13
|
import pandas as pd
|
|
14
14
|
import pandas.io.formats.style
|
|
@@ -34,14 +34,6 @@ __all__ = ['DataFrame']
|
|
|
34
34
|
_logger = logging.getLogger('pixeltable')
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
def _create_source_tag(file_path: str) -> str:
|
|
38
|
-
src_url = get_file_uri(Env.get().http_address, file_path)
|
|
39
|
-
mime = mimetypes.guess_type(src_url)[0]
|
|
40
|
-
# if mime is None, the attribute string would not be valid html.
|
|
41
|
-
mime_attr = f'type="{mime}"' if mime is not None else ''
|
|
42
|
-
return f'<source src="{src_url}" {mime_attr} />'
|
|
43
|
-
|
|
44
|
-
|
|
45
37
|
class DataFrameResultSet:
|
|
46
38
|
def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
|
|
47
39
|
self._rows = rows
|
|
@@ -77,7 +69,7 @@ class DataFrameResultSet:
|
|
|
77
69
|
def to_pandas(self) -> pd.DataFrame:
|
|
78
70
|
return pd.DataFrame.from_records(self._rows, columns=self._col_names)
|
|
79
71
|
|
|
80
|
-
def _row_to_dict(self, row_idx: int) ->
|
|
72
|
+
def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
|
|
81
73
|
return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
|
|
82
74
|
|
|
83
75
|
def __getitem__(self, index: Any) -> Any:
|
|
@@ -111,22 +103,22 @@ class DataFrameResultSet:
|
|
|
111
103
|
# def __init__(self, tbl: catalog.TableVersion):
|
|
112
104
|
# self.tbl = tbl
|
|
113
105
|
# # output of the SQL scan stage
|
|
114
|
-
# self.sql_scan_output_exprs:
|
|
106
|
+
# self.sql_scan_output_exprs: list[exprs.Expr] = []
|
|
115
107
|
# # output of the agg stage
|
|
116
|
-
# self.agg_output_exprs:
|
|
108
|
+
# self.agg_output_exprs: list[exprs.Expr] = []
|
|
117
109
|
# # Where clause of the Select stmt of the SQL scan stage
|
|
118
110
|
# self.sql_where_clause: Optional[sql.ClauseElement] = None
|
|
119
111
|
# # filter predicate applied to input rows of the SQL scan stage
|
|
120
112
|
# self.filter: Optional[exprs.Predicate] = None
|
|
121
113
|
# self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
|
|
122
|
-
# self.agg_fn_calls:
|
|
114
|
+
# self.agg_fn_calls: list[exprs.FunctionCall] = [] # derived from unique_exprs
|
|
123
115
|
# self.has_frame_col: bool = False # True if we're referencing the frame col
|
|
124
116
|
#
|
|
125
117
|
# self.evaluator: Optional[exprs.Evaluator] = None
|
|
126
|
-
# self.sql_scan_eval_ctx:
|
|
127
|
-
# self.agg_eval_ctx:
|
|
128
|
-
# self.filter_eval_ctx:
|
|
129
|
-
# self.group_by_eval_ctx:
|
|
118
|
+
# self.sql_scan_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of SQL scan stage
|
|
119
|
+
# self.agg_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of agg stage
|
|
120
|
+
# self.filter_eval_ctx: list[exprs.Expr] = []
|
|
121
|
+
# self.group_by_eval_ctx: list[exprs.Expr] = []
|
|
130
122
|
#
|
|
131
123
|
# def finalize_exec(self) -> None:
|
|
132
124
|
# """
|
|
@@ -142,11 +134,11 @@ class DataFrame:
|
|
|
142
134
|
def __init__(
|
|
143
135
|
self,
|
|
144
136
|
tbl: catalog.TableVersionPath,
|
|
145
|
-
select_list: Optional[
|
|
137
|
+
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
|
|
146
138
|
where_clause: Optional[exprs.Expr] = None,
|
|
147
|
-
group_by_clause: Optional[
|
|
139
|
+
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
148
140
|
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
149
|
-
order_by_clause: Optional[
|
|
141
|
+
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
|
|
150
142
|
limit: Optional[int] = None,
|
|
151
143
|
):
|
|
152
144
|
self.tbl = tbl
|
|
@@ -174,7 +166,7 @@ class DataFrame:
|
|
|
174
166
|
@classmethod
|
|
175
167
|
def _select_list_check_rep(
|
|
176
168
|
cls,
|
|
177
|
-
select_list: Optional[
|
|
169
|
+
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
178
170
|
) -> None:
|
|
179
171
|
"""Validate basic select list types."""
|
|
180
172
|
if select_list is None: # basic check for valid select list
|
|
@@ -371,15 +363,10 @@ class DataFrame:
|
|
|
371
363
|
group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
|
|
372
364
|
order_by_clause=order_by_clause, limit=self.limit_val)
|
|
373
365
|
|
|
374
|
-
def
|
|
375
|
-
return self._collect()
|
|
376
|
-
|
|
377
|
-
def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
|
|
366
|
+
def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
|
|
378
367
|
try:
|
|
379
|
-
result_rows = []
|
|
380
368
|
for data_row in self._exec(conn):
|
|
381
|
-
|
|
382
|
-
result_rows.append(result_row)
|
|
369
|
+
yield [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
383
370
|
except excs.ExprEvalError as e:
|
|
384
371
|
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
|
|
385
372
|
if len(e.input_vals) > 0:
|
|
@@ -399,7 +386,11 @@ class DataFrame:
|
|
|
399
386
|
except sql.exc.DBAPIError as e:
|
|
400
387
|
raise excs.Error(f'Error during SQL execution:\n{e}')
|
|
401
388
|
|
|
402
|
-
|
|
389
|
+
def collect(self) -> DataFrameResultSet:
|
|
390
|
+
return self._collect()
|
|
391
|
+
|
|
392
|
+
def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
|
|
393
|
+
return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
|
|
403
394
|
|
|
404
395
|
def count(self) -> int:
|
|
405
396
|
from pixeltable.plan import Planner
|
|
@@ -412,8 +403,8 @@ class DataFrame:
|
|
|
412
403
|
|
|
413
404
|
def _description(self) -> pd.DataFrame:
|
|
414
405
|
"""see DataFrame.describe()"""
|
|
415
|
-
heading_vals:
|
|
416
|
-
info_vals:
|
|
406
|
+
heading_vals: list[str] = []
|
|
407
|
+
info_vals: list[str] = []
|
|
417
408
|
if self.select_list is not None:
|
|
418
409
|
assert len(self.select_list) > 0
|
|
419
410
|
heading_vals.append('Select')
|
|
@@ -498,7 +489,7 @@ class DataFrame:
|
|
|
498
489
|
|
|
499
490
|
# check user provided names do not conflict among themselves
|
|
500
491
|
# or with auto-generated ones
|
|
501
|
-
seen:
|
|
492
|
+
seen: set[str] = set()
|
|
502
493
|
_, names = DataFrame._normalize_select_list(self.tbl, select_list)
|
|
503
494
|
for name in names:
|
|
504
495
|
if name in seen:
|
|
@@ -541,7 +532,7 @@ class DataFrame:
|
|
|
541
532
|
if self.group_by_clause is not None:
|
|
542
533
|
raise excs.Error(f'Group-by already specified')
|
|
543
534
|
grouping_tbl: Optional[catalog.TableVersion] = None
|
|
544
|
-
group_by_clause: Optional[
|
|
535
|
+
group_by_clause: Optional[list[exprs.Expr]] = None
|
|
545
536
|
for item in grouping_items:
|
|
546
537
|
if isinstance(item, catalog.Table):
|
|
547
538
|
if len(grouping_items) > 1:
|
|
@@ -619,7 +610,7 @@ class DataFrame:
|
|
|
619
610
|
def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> DataFrame:
|
|
620
611
|
"""
|
|
621
612
|
Allowed:
|
|
622
|
-
- [
|
|
613
|
+
- [list[Expr]]/[tuple[Expr]]: setting the select list
|
|
623
614
|
- [Expr]: setting a single-col select list
|
|
624
615
|
"""
|
|
625
616
|
if isinstance(index, exprs.Expr):
|
|
@@ -628,7 +619,7 @@ class DataFrame:
|
|
|
628
619
|
return self.select(*index)
|
|
629
620
|
raise TypeError(f'Invalid index type: {type(index)}')
|
|
630
621
|
|
|
631
|
-
def as_dict(self) ->
|
|
622
|
+
def as_dict(self) -> dict[str, Any]:
|
|
632
623
|
"""
|
|
633
624
|
Returns:
|
|
634
625
|
Dictionary representing this dataframe.
|
|
@@ -650,7 +641,7 @@ class DataFrame:
|
|
|
650
641
|
return d
|
|
651
642
|
|
|
652
643
|
@classmethod
|
|
653
|
-
def from_dict(cls, d:
|
|
644
|
+
def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
|
|
654
645
|
tbl = catalog.TableVersionPath.from_dict(d['tbl'])
|
|
655
646
|
select_list = [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] \
|
|
656
647
|
if d['select_list'] is not None else None
|
pixeltable/env.py
CHANGED
|
@@ -494,18 +494,25 @@ class Env:
|
|
|
494
494
|
self.__register_package('anthropic')
|
|
495
495
|
self.__register_package('boto3')
|
|
496
496
|
self.__register_package('datasets')
|
|
497
|
+
self.__register_package('fiftyone')
|
|
497
498
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
499
|
+
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
498
500
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
501
|
+
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
499
502
|
self.__register_package('mistralai')
|
|
500
503
|
self.__register_package('mistune')
|
|
504
|
+
self.__register_package('ollama')
|
|
501
505
|
self.__register_package('openai')
|
|
502
506
|
self.__register_package('openpyxl')
|
|
503
507
|
self.__register_package('pyarrow')
|
|
508
|
+
self.__register_package('replicate')
|
|
509
|
+
self.__register_package('sentencepiece')
|
|
504
510
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
505
511
|
self.__register_package('spacy')
|
|
506
512
|
self.__register_package('tiktoken')
|
|
507
513
|
self.__register_package('together')
|
|
508
514
|
self.__register_package('torch')
|
|
515
|
+
self.__register_package('torchaudio')
|
|
509
516
|
self.__register_package('torchvision')
|
|
510
517
|
self.__register_package('transformers')
|
|
511
518
|
self.__register_package('whisper', library_name='openai-whisper')
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -6,6 +6,5 @@ from .exec_context import ExecContext
|
|
|
6
6
|
from .exec_node import ExecNode
|
|
7
7
|
from .expr_eval_node import ExprEvalNode
|
|
8
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
9
|
-
from .media_validation_node import MediaValidationNode
|
|
10
9
|
from .row_update_node import RowUpdateNode
|
|
11
10
|
from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode
|
|
@@ -2,11 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Iterable,
|
|
5
|
+
from typing import Any, Iterable, Iterator, Optional, cast
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
9
9
|
import pixeltable.exprs as exprs
|
|
10
|
+
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
11
12
|
from .exec_node import ExecNode
|
|
12
13
|
|
|
@@ -28,13 +29,15 @@ class AggregationNode(ExecNode):
|
|
|
28
29
|
self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
|
|
29
30
|
agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
|
|
30
31
|
):
|
|
31
|
-
|
|
32
|
+
output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
|
|
33
|
+
output_exprs.extend(agg_fn_calls)
|
|
34
|
+
super().__init__(row_builder, output_exprs, input_exprs, input)
|
|
32
35
|
self.input = input
|
|
33
36
|
self.group_by = group_by
|
|
34
37
|
self.input_exprs = list(input_exprs)
|
|
35
38
|
self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=self.input_exprs)
|
|
36
39
|
# we need to make sure to refer to the same exprs that RowBuilder.eval() will use
|
|
37
|
-
self.agg_fn_calls = self.agg_fn_eval_ctx.target_exprs
|
|
40
|
+
self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
|
|
38
41
|
# create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
|
|
39
42
|
self.output_batch = DataRowBatch(tbl, row_builder, 0)
|
|
40
43
|
|