pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +19 -3
- pixeltable/catalog/table_version.py +34 -14
- pixeltable/catalog/view.py +16 -17
- pixeltable/dataframe.py +7 -8
- pixeltable/env.py +5 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +2 -19
- pixeltable/exec/exec_node.py +2 -1
- pixeltable/exec/expr_eval_node.py +17 -10
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/function.py +11 -10
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +19 -20
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/globals.py +12 -20
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +16 -14
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +17 -1
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +9 -10
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/__version__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# These version placeholders will be replaced during build.
|
|
2
|
-
__version__ = "0.2.21"
|
|
3
|
-
__version_tuple__ = (0, 2, 21)
|
|
2
|
+
__version__ = "0.2.22"
|
|
3
|
+
__version_tuple__ = (0, 2, 22)
|
pixeltable/catalog/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from .catalog import Catalog
|
|
2
2
|
from .column import Column
|
|
3
3
|
from .dir import Dir
|
|
4
|
-
from .globals import UpdateStatus, is_valid_identifier, is_valid_path
|
|
4
|
+
from .globals import UpdateStatus, is_valid_identifier, is_valid_path, MediaValidation
|
|
5
5
|
from .insertable_table import InsertableTable
|
|
6
6
|
from .named_function import NamedFunction
|
|
7
7
|
from .path import Path
|
pixeltable/catalog/column.py
CHANGED
|
@@ -8,24 +8,43 @@ import sqlalchemy as sql
|
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
9
9
|
import pixeltable.type_system as ts
|
|
10
10
|
from pixeltable import exprs
|
|
11
|
-
|
|
12
|
-
from .globals import is_valid_identifier
|
|
11
|
+
from .globals import is_valid_identifier, MediaValidation
|
|
13
12
|
|
|
14
13
|
if TYPE_CHECKING:
|
|
15
14
|
from .table_version import TableVersion
|
|
16
15
|
|
|
17
16
|
_logger = logging.getLogger('pixeltable')
|
|
18
17
|
|
|
18
|
+
|
|
19
19
|
class Column:
|
|
20
20
|
"""Representation of a column in the schema of a Table/DataFrame.
|
|
21
21
|
|
|
22
22
|
A Column contains all the metadata necessary for executing queries and updates against a particular version of a
|
|
23
23
|
table/view.
|
|
24
24
|
"""
|
|
25
|
+
name: str
|
|
26
|
+
id: Optional[int]
|
|
27
|
+
col_type: ts.ColumnType
|
|
28
|
+
stored: bool
|
|
29
|
+
is_pk: bool
|
|
30
|
+
_media_validation: Optional[MediaValidation] # if not set, TableVersion.media_validation applies
|
|
31
|
+
schema_version_add: Optional[int]
|
|
32
|
+
schema_version_drop: Optional[int]
|
|
33
|
+
_records_errors: Optional[bool]
|
|
34
|
+
sa_col: Optional[sql.schema.Column]
|
|
35
|
+
sa_col_type: Optional[sql.sqltypes.TypeEngine]
|
|
36
|
+
sa_errormsg_col: Optional[sql.schema.Column]
|
|
37
|
+
sa_errortype_col: Optional[sql.schema.Column]
|
|
38
|
+
compute_func: Optional[Callable]
|
|
39
|
+
_value_expr: Optional[exprs.Expr]
|
|
40
|
+
value_expr_dict: Optional[dict[str, Any]]
|
|
41
|
+
dependent_cols: set[Column]
|
|
42
|
+
tbl: Optional[TableVersion]
|
|
43
|
+
|
|
25
44
|
def __init__(
|
|
26
45
|
self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
|
|
27
46
|
computed_with: Optional[Union[exprs.Expr, Callable]] = None,
|
|
28
|
-
is_pk: bool = False, stored: bool = True,
|
|
47
|
+
is_pk: bool = False, stored: bool = True, media_validation: Optional[MediaValidation] = None,
|
|
29
48
|
col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
|
|
30
49
|
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None,
|
|
31
50
|
records_errors: Optional[bool] = None, value_expr_dict: Optional[dict[str, Any]] = None,
|
|
@@ -61,8 +80,8 @@ class Column:
|
|
|
61
80
|
if col_type is None and computed_with is None:
|
|
62
81
|
raise excs.Error(f'Column `{name}`: col_type is required if computed_with is not specified')
|
|
63
82
|
|
|
64
|
-
self._value_expr: Optional[exprs.Expr] = None
|
|
65
|
-
self.compute_func: Optional[Callable] = None
|
|
83
|
+
self._value_expr = None
|
|
84
|
+
self.compute_func = None
|
|
66
85
|
self.value_expr_dict = value_expr_dict
|
|
67
86
|
if computed_with is not None:
|
|
68
87
|
value_expr = exprs.Expr.from_object(computed_with)
|
|
@@ -86,24 +105,24 @@ class Column:
|
|
|
86
105
|
assert self.col_type is not None
|
|
87
106
|
|
|
88
107
|
self.stored = stored
|
|
89
|
-
self.dependent_cols: set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
|
|
108
|
+
self.dependent_cols = set() # cols with value_exprs that reference us; set by TableVersion
|
|
90
109
|
self.id = col_id
|
|
91
110
|
self.is_pk = is_pk
|
|
111
|
+
self._media_validation = media_validation
|
|
92
112
|
self.schema_version_add = schema_version_add
|
|
93
113
|
self.schema_version_drop = schema_version_drop
|
|
94
114
|
|
|
95
115
|
self._records_errors = records_errors
|
|
96
116
|
|
|
97
117
|
# column in the stored table for the values of this Column
|
|
98
|
-
self.sa_col: Optional[sql.schema.Column] = None
|
|
118
|
+
self.sa_col = None
|
|
99
119
|
self.sa_col_type = sa_col_type
|
|
100
120
|
|
|
101
121
|
# computed cols also have storage columns for the exception string and type
|
|
102
|
-
self.sa_errormsg_col: Optional[sql.schema.Column] = None
|
|
103
|
-
self.sa_errortype_col: Optional[sql.schema.Column] = None
|
|
122
|
+
self.sa_errormsg_col = None
|
|
123
|
+
self.sa_errortype_col = None
|
|
104
124
|
|
|
105
|
-
|
|
106
|
-
self.tbl: Optional[TableVersion] = None # set by owning TableVersion
|
|
125
|
+
self.tbl = None # set by owning TableVersion
|
|
107
126
|
|
|
108
127
|
@property
|
|
109
128
|
def value_expr(self) -> Optional[exprs.Expr]:
|
|
@@ -160,6 +179,13 @@ class Column:
|
|
|
160
179
|
assert self.tbl is not None
|
|
161
180
|
return f'{self.tbl.name}.{self.name}'
|
|
162
181
|
|
|
182
|
+
@property
|
|
183
|
+
def media_validation(self) -> MediaValidation:
|
|
184
|
+
if self._media_validation is not None:
|
|
185
|
+
return self._media_validation
|
|
186
|
+
assert self.tbl is not None
|
|
187
|
+
return self.tbl.media_validation
|
|
188
|
+
|
|
163
189
|
def source(self) -> None:
|
|
164
190
|
"""
|
|
165
191
|
If this is a computed col and the top-level expr is a function call, print the source, if possible.
|
pixeltable/catalog/globals.py
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
1
2
|
import dataclasses
|
|
3
|
+
import enum
|
|
2
4
|
import itertools
|
|
3
5
|
import logging
|
|
4
6
|
from typing import Optional
|
|
5
7
|
|
|
8
|
+
import pixeltable.exceptions as excs
|
|
9
|
+
|
|
6
10
|
_logger = logging.getLogger('pixeltable')
|
|
7
11
|
|
|
8
12
|
# name of the position column in a component view
|
|
@@ -34,6 +38,20 @@ class UpdateStatus:
|
|
|
34
38
|
self.cols_with_excs = list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs))
|
|
35
39
|
return self
|
|
36
40
|
|
|
41
|
+
|
|
42
|
+
class MediaValidation(enum.Enum):
|
|
43
|
+
ON_READ = 0
|
|
44
|
+
ON_WRITE = 1
|
|
45
|
+
|
|
46
|
+
@classmethod
|
|
47
|
+
def validated(cls, name: str, error_prefix: str) -> MediaValidation:
|
|
48
|
+
try:
|
|
49
|
+
return cls[name.upper()]
|
|
50
|
+
except KeyError:
|
|
51
|
+
val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__.keys())
|
|
52
|
+
raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]')
|
|
53
|
+
|
|
54
|
+
|
|
37
55
|
def is_valid_identifier(name: str) -> bool:
|
|
38
56
|
return name.isidentifier() and not name.startswith('_')
|
|
39
57
|
|
|
@@ -13,7 +13,7 @@ from pixeltable.env import Env
|
|
|
13
13
|
from pixeltable.utils.filecache import FileCache
|
|
14
14
|
|
|
15
15
|
from .catalog import Catalog
|
|
16
|
-
from .globals import UpdateStatus
|
|
16
|
+
from .globals import UpdateStatus, MediaValidation
|
|
17
17
|
from .table import Table
|
|
18
18
|
from .table_version import TableVersion
|
|
19
19
|
from .table_version_path import TableVersionPath
|
|
@@ -35,8 +35,8 @@ class InsertableTable(Table):
|
|
|
35
35
|
# MODULE-LOCAL, NOT PUBLIC
|
|
36
36
|
@classmethod
|
|
37
37
|
def _create(
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
cls, dir_id: UUID, name: str, schema: dict[str, ts.ColumnType], df: Optional[pxt.DataFrame],
|
|
39
|
+
primary_key: List[str], num_retained_versions: int, comment: str, media_validation: MediaValidation
|
|
40
40
|
) -> InsertableTable:
|
|
41
41
|
columns = cls._create_columns(schema)
|
|
42
42
|
cls._verify_schema(columns)
|
|
@@ -50,7 +50,9 @@ class InsertableTable(Table):
|
|
|
50
50
|
col.is_pk = True
|
|
51
51
|
|
|
52
52
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
53
|
-
_, tbl_version = TableVersion.create(
|
|
53
|
+
_, tbl_version = TableVersion.create(
|
|
54
|
+
session, dir_id, name, columns, num_retained_versions=num_retained_versions, comment=comment,
|
|
55
|
+
media_validation=media_validation)
|
|
54
56
|
tbl = cls(dir_id, tbl_version)
|
|
55
57
|
# TODO We need to commit before doing the insertion, in order to avoid a primary key (version) collision
|
|
56
58
|
# when the table metadata gets updated. Once we have a notion of user-defined transactions in
|
pixeltable/catalog/table.py
CHANGED
|
@@ -24,7 +24,7 @@ import pixeltable.type_system as ts
|
|
|
24
24
|
from pixeltable.utils.filecache import FileCache
|
|
25
25
|
|
|
26
26
|
from .column import Column
|
|
27
|
-
from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier
|
|
27
|
+
from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier, MediaValidation
|
|
28
28
|
from .schema_object import SchemaObject
|
|
29
29
|
from .table_version import TableVersion
|
|
30
30
|
from .table_version_path import TableVersionPath
|
|
@@ -91,6 +91,7 @@ class Table(SchemaObject):
|
|
|
91
91
|
'num_retained_versions': 10,
|
|
92
92
|
'is_view': False,
|
|
93
93
|
'is_snapshot': False,
|
|
94
|
+
'media_validation': 'on_write',
|
|
94
95
|
}
|
|
95
96
|
```
|
|
96
97
|
"""
|
|
@@ -101,6 +102,7 @@ class Table(SchemaObject):
|
|
|
101
102
|
md['schema_version'] = self._tbl_version.schema_version
|
|
102
103
|
md['comment'] = self._comment
|
|
103
104
|
md['num_retained_versions'] = self._num_retained_versions
|
|
105
|
+
md['media_validation'] = self._media_validation.name.lower()
|
|
104
106
|
return md
|
|
105
107
|
|
|
106
108
|
@property
|
|
@@ -244,6 +246,10 @@ class Table(SchemaObject):
|
|
|
244
246
|
def _num_retained_versions(self):
|
|
245
247
|
return self._tbl_version.num_retained_versions
|
|
246
248
|
|
|
249
|
+
@property
|
|
250
|
+
def _media_validation(self) -> MediaValidation:
|
|
251
|
+
return self._tbl_version.media_validation
|
|
252
|
+
|
|
247
253
|
def _description(self) -> pd.DataFrame:
|
|
248
254
|
cols = self._tbl_version_path.columns()
|
|
249
255
|
df = pd.DataFrame({
|
|
@@ -422,7 +428,7 @@ class Table(SchemaObject):
|
|
|
422
428
|
(on account of containing Python Callables or Exprs).
|
|
423
429
|
"""
|
|
424
430
|
assert isinstance(spec, dict)
|
|
425
|
-
valid_keys = {'type', 'value', 'stored'}
|
|
431
|
+
valid_keys = {'type', 'value', 'stored', 'media_validation'}
|
|
426
432
|
has_type = False
|
|
427
433
|
for k in spec.keys():
|
|
428
434
|
if k not in valid_keys:
|
|
@@ -449,6 +455,9 @@ class Table(SchemaObject):
|
|
|
449
455
|
if 'type' in spec:
|
|
450
456
|
raise excs.Error(f'Column {name}: "type" is redundant if value is a Pixeltable expression')
|
|
451
457
|
|
|
458
|
+
if 'media_validation' in spec:
|
|
459
|
+
_ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name}: media_validation')
|
|
460
|
+
|
|
452
461
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
453
462
|
raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
|
|
454
463
|
if not has_type:
|
|
@@ -462,6 +471,7 @@ class Table(SchemaObject):
|
|
|
462
471
|
col_type: Optional[ts.ColumnType] = None
|
|
463
472
|
value_expr: Optional[exprs.Expr] = None
|
|
464
473
|
primary_key: Optional[bool] = None
|
|
474
|
+
media_validation: Optional[catalog.MediaValidation] = None
|
|
465
475
|
stored = True
|
|
466
476
|
|
|
467
477
|
if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
|
|
@@ -484,9 +494,15 @@ class Table(SchemaObject):
|
|
|
484
494
|
value_expr = value_expr.copy()
|
|
485
495
|
stored = spec.get('stored', True)
|
|
486
496
|
primary_key = spec.get('primary_key')
|
|
497
|
+
media_validation_str = spec.get('media_validation')
|
|
498
|
+
media_validation = (
|
|
499
|
+
catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None
|
|
500
|
+
else None
|
|
501
|
+
)
|
|
487
502
|
|
|
488
503
|
column = Column(
|
|
489
|
-
name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key
|
|
504
|
+
name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key,
|
|
505
|
+
media_validation=media_validation)
|
|
490
506
|
columns.append(column)
|
|
491
507
|
return columns
|
|
492
508
|
|
|
@@ -26,7 +26,7 @@ from pixeltable.utils.media_store import MediaStore
|
|
|
26
26
|
|
|
27
27
|
from ..func.globals import resolve_symbol
|
|
28
28
|
from .column import Column
|
|
29
|
-
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier
|
|
29
|
+
from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier, MediaValidation
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING:
|
|
32
32
|
from pixeltable import exec, store
|
|
@@ -53,6 +53,7 @@ class TableVersion:
|
|
|
53
53
|
name: str
|
|
54
54
|
version: int
|
|
55
55
|
comment: str
|
|
56
|
+
media_validation: MediaValidation
|
|
56
57
|
num_retained_versions: int
|
|
57
58
|
schema_version: int
|
|
58
59
|
view_md: Optional[schema.ViewMd]
|
|
@@ -109,6 +110,7 @@ class TableVersion:
|
|
|
109
110
|
self.view_md = tbl_md.view_md # save this as-is, it's needed for _create_md()
|
|
110
111
|
is_view = tbl_md.view_md is not None
|
|
111
112
|
self.is_snapshot = (is_view and tbl_md.view_md.is_snapshot) or bool(is_snapshot)
|
|
113
|
+
self.media_validation = MediaValidation[schema_version_md.media_validation.upper()]
|
|
112
114
|
# a mutable TableVersion doesn't have a static version
|
|
113
115
|
self.effective_version = self.version if self.is_snapshot else None
|
|
114
116
|
|
|
@@ -182,7 +184,7 @@ class TableVersion:
|
|
|
182
184
|
@classmethod
|
|
183
185
|
def create(
|
|
184
186
|
cls, session: orm.Session, dir_id: UUID, name: str, cols: list[Column], num_retained_versions: int,
|
|
185
|
-
comment: str, base_path: Optional[pxt.catalog.TableVersionPath] = None,
|
|
187
|
+
comment: str, media_validation: MediaValidation, base_path: Optional[pxt.catalog.TableVersionPath] = None,
|
|
186
188
|
view_md: Optional[schema.ViewMd] = None
|
|
187
189
|
) -> tuple[UUID, Optional[TableVersion]]:
|
|
188
190
|
# assign ids
|
|
@@ -214,11 +216,17 @@ class TableVersion:
|
|
|
214
216
|
tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
|
|
215
217
|
|
|
216
218
|
# create schema.TableSchemaVersion
|
|
217
|
-
schema_col_md
|
|
219
|
+
schema_col_md: dict[int, schema.SchemaColumn] = {}
|
|
220
|
+
for pos, col in enumerate(cols):
|
|
221
|
+
md = schema.SchemaColumn(
|
|
222
|
+
pos=pos, name=col.name,
|
|
223
|
+
media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
|
|
224
|
+
schema_col_md[col.id] = md
|
|
218
225
|
|
|
219
226
|
schema_version_md = schema.TableSchemaVersionMd(
|
|
220
227
|
schema_version=0, preceding_schema_version=None, columns=schema_col_md,
|
|
221
|
-
num_retained_versions=num_retained_versions, comment=comment
|
|
228
|
+
num_retained_versions=num_retained_versions, comment=comment,
|
|
229
|
+
media_validation=media_validation.name.lower())
|
|
222
230
|
schema_version_record = schema.TableSchemaVersion(
|
|
223
231
|
tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
|
|
224
232
|
|
|
@@ -285,10 +293,15 @@ class TableVersion:
|
|
|
285
293
|
self.cols_by_name = {}
|
|
286
294
|
self.cols_by_id = {}
|
|
287
295
|
for col_md in tbl_md.column_md.values():
|
|
288
|
-
|
|
296
|
+
schema_col_md = schema_version_md.columns[col_md.id] if col_md.id in schema_version_md.columns else None
|
|
297
|
+
col_name = schema_col_md.name if schema_col_md is not None else None
|
|
298
|
+
media_val = (
|
|
299
|
+
MediaValidation[schema_col_md.media_validation.upper()]
|
|
300
|
+
if schema_col_md is not None and schema_col_md.media_validation is not None else None
|
|
301
|
+
)
|
|
289
302
|
col = Column(
|
|
290
303
|
col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
|
|
291
|
-
is_pk=col_md.is_pk, stored=col_md.stored,
|
|
304
|
+
is_pk=col_md.is_pk, stored=col_md.stored, media_validation=media_val,
|
|
292
305
|
schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop,
|
|
293
306
|
value_expr_dict=col_md.value_expr)
|
|
294
307
|
col.tbl = self
|
|
@@ -349,7 +362,8 @@ class TableVersion:
|
|
|
349
362
|
self.store_tbl = StoreTable(self)
|
|
350
363
|
|
|
351
364
|
def _update_md(
|
|
352
|
-
|
|
365
|
+
self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True,
|
|
366
|
+
preceding_schema_version: Optional[int] = None
|
|
353
367
|
) -> None:
|
|
354
368
|
"""Writes table metadata to the database.
|
|
355
369
|
|
|
@@ -710,20 +724,22 @@ class TableVersion:
|
|
|
710
724
|
|
|
711
725
|
if conn is None:
|
|
712
726
|
with Env.get().engine.begin() as conn:
|
|
713
|
-
return self._insert(
|
|
727
|
+
return self._insert(
|
|
728
|
+
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
|
714
729
|
else:
|
|
715
|
-
return self._insert(
|
|
730
|
+
return self._insert(
|
|
731
|
+
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
|
716
732
|
|
|
717
733
|
def _insert(
|
|
718
734
|
self, exec_plan: 'exec.ExecNode', conn: sql.engine.Connection, timestamp: float, *,
|
|
719
|
-
rowids: Optional[Iterator[int]] = None, print_stats: bool = False,
|
|
735
|
+
rowids: Optional[Iterator[int]] = None, print_stats: bool = False, abort_on_exc: bool = False
|
|
720
736
|
) -> UpdateStatus:
|
|
721
737
|
"""Insert rows produced by exec_plan and propagate to views"""
|
|
722
738
|
# we're creating a new version
|
|
723
739
|
self.version += 1
|
|
724
740
|
result = UpdateStatus()
|
|
725
741
|
num_rows, num_excs, cols_with_excs = self.store_tbl.insert_rows(
|
|
726
|
-
exec_plan, conn, v_min=self.version, rowids=rowids)
|
|
742
|
+
exec_plan, conn, v_min=self.version, rowids=rowids, abort_on_exc=abort_on_exc)
|
|
727
743
|
result.num_rows = num_rows
|
|
728
744
|
result.num_excs = num_excs
|
|
729
745
|
result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
|
|
@@ -1203,7 +1219,8 @@ class TableVersion:
|
|
|
1203
1219
|
name=self.name, current_version=self.version, current_schema_version=self.schema_version,
|
|
1204
1220
|
next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
|
|
1205
1221
|
column_md=self._create_column_md(self.cols), index_md=self.idx_md,
|
|
1206
|
-
external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md
|
|
1222
|
+
external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md,
|
|
1223
|
+
)
|
|
1207
1224
|
|
|
1208
1225
|
def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
|
|
1209
1226
|
return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
|
|
@@ -1211,11 +1228,14 @@ class TableVersion:
|
|
|
1211
1228
|
def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
|
|
1212
1229
|
column_md: dict[int, schema.SchemaColumn] = {}
|
|
1213
1230
|
for pos, col in enumerate(self.cols_by_name.values()):
|
|
1214
|
-
column_md[col.id] = schema.SchemaColumn(
|
|
1231
|
+
column_md[col.id] = schema.SchemaColumn(
|
|
1232
|
+
pos=pos, name=col.name,
|
|
1233
|
+
media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
|
|
1215
1234
|
# preceding_schema_version to be set by the caller
|
|
1216
1235
|
return schema.TableSchemaVersionMd(
|
|
1217
1236
|
schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,
|
|
1218
|
-
columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment
|
|
1237
|
+
columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment,
|
|
1238
|
+
media_validation=self.media_validation.name.lower())
|
|
1219
1239
|
|
|
1220
1240
|
def as_dict(self) -> dict:
|
|
1221
1241
|
return {'id': str(self.id), 'effective_version': self.effective_version}
|
pixeltable/catalog/view.py
CHANGED
|
@@ -2,24 +2,21 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import inspect
|
|
4
4
|
import logging
|
|
5
|
-
from typing import TYPE_CHECKING, Any,
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Iterable, Optional
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
8
|
import sqlalchemy.orm as orm
|
|
9
9
|
|
|
10
|
-
import pixeltable.catalog as catalog
|
|
11
10
|
import pixeltable.exceptions as excs
|
|
12
|
-
import pixeltable.exprs as exprs
|
|
13
|
-
import pixeltable.func as func
|
|
14
11
|
import pixeltable.metadata.schema as md_schema
|
|
12
|
+
import pixeltable.type_system as ts
|
|
13
|
+
from pixeltable import catalog, exprs, func
|
|
15
14
|
from pixeltable.env import Env
|
|
16
|
-
from pixeltable.exceptions import Error
|
|
17
15
|
from pixeltable.iterators import ComponentIterator
|
|
18
|
-
from pixeltable.type_system import IntType, InvalidType
|
|
19
16
|
|
|
20
17
|
from .catalog import Catalog
|
|
21
18
|
from .column import Column
|
|
22
|
-
from .globals import _POS_COLUMN_NAME, UpdateStatus
|
|
19
|
+
from .globals import _POS_COLUMN_NAME, UpdateStatus, MediaValidation
|
|
23
20
|
from .table import Table
|
|
24
21
|
from .table_version import TableVersion
|
|
25
22
|
from .table_version_path import TableVersionPath
|
|
@@ -52,9 +49,10 @@ class View(Table):
|
|
|
52
49
|
|
|
53
50
|
@classmethod
|
|
54
51
|
def _create(
|
|
55
|
-
cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns:
|
|
52
|
+
cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: dict[str, Any],
|
|
56
53
|
predicate: Optional['pxt.exprs.Expr'], is_snapshot: bool, num_retained_versions: int, comment: str,
|
|
57
|
-
|
|
54
|
+
media_validation: MediaValidation,
|
|
55
|
+
iterator_cls: Optional[type[ComponentIterator]], iterator_args: Optional[dict]
|
|
58
56
|
) -> View:
|
|
59
57
|
columns = cls._create_columns(additional_columns)
|
|
60
58
|
cls._verify_schema(columns)
|
|
@@ -92,17 +90,17 @@ class View(Table):
|
|
|
92
90
|
func.Parameter(param_name, param_type, kind=inspect.Parameter.POSITIONAL_OR_KEYWORD)
|
|
93
91
|
for param_name, param_type in iterator_cls.input_schema().items()
|
|
94
92
|
]
|
|
95
|
-
sig = func.Signature(InvalidType(), params)
|
|
93
|
+
sig = func.Signature(ts.InvalidType(), params)
|
|
96
94
|
from pixeltable.exprs import FunctionCall
|
|
97
95
|
FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
|
|
98
96
|
except TypeError as e:
|
|
99
|
-
raise Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
97
|
+
raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
100
98
|
|
|
101
99
|
# prepend pos and output_schema columns to cols:
|
|
102
100
|
# a component view exposes the pos column of its rowid;
|
|
103
101
|
# we create that column here, so it gets assigned a column id;
|
|
104
102
|
# stored=False: it is not stored separately (it's already stored as part of the rowid)
|
|
105
|
-
iterator_cols = [Column(_POS_COLUMN_NAME, IntType(), stored=False)]
|
|
103
|
+
iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
|
|
106
104
|
output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
|
|
107
105
|
iterator_cols.extend([
|
|
108
106
|
Column(col_name, col_type, stored=col_name not in unstored_cols)
|
|
@@ -112,12 +110,12 @@ class View(Table):
|
|
|
112
110
|
iterator_col_names = {col.name for col in iterator_cols}
|
|
113
111
|
for col in columns:
|
|
114
112
|
if col.name in iterator_col_names:
|
|
115
|
-
raise Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
|
|
113
|
+
raise excs.Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
|
|
116
114
|
columns = iterator_cols + columns
|
|
117
115
|
|
|
118
116
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
119
117
|
from pixeltable.exprs import InlineDict
|
|
120
|
-
iterator_args_expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
118
|
+
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
121
119
|
iterator_class_fqn = f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None \
|
|
122
120
|
else None
|
|
123
121
|
base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
|
|
@@ -142,7 +140,8 @@ class View(Table):
|
|
|
142
140
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None)
|
|
143
141
|
|
|
144
142
|
id, tbl_version = TableVersion.create(
|
|
145
|
-
session, dir_id, name, columns, num_retained_versions, comment,
|
|
143
|
+
session, dir_id, name, columns, num_retained_versions, comment, media_validation=media_validation,
|
|
144
|
+
base_path=base_version_path, view_md=view_md)
|
|
146
145
|
if tbl_version is None:
|
|
147
146
|
# this is purely a snapshot: we use the base's tbl version path
|
|
148
147
|
view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
|
|
@@ -168,11 +167,11 @@ class View(Table):
|
|
|
168
167
|
|
|
169
168
|
@classmethod
|
|
170
169
|
def _verify_column(
|
|
171
|
-
cls, col: Column, existing_column_names:
|
|
170
|
+
cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
|
|
172
171
|
) -> None:
|
|
173
172
|
# make sure that columns are nullable or have a default
|
|
174
173
|
if not col.col_type.nullable and not col.is_computed:
|
|
175
|
-
raise Error(f'Column {col.name}: non-computed columns in views must be nullable')
|
|
174
|
+
raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
|
|
176
175
|
super()._verify_column(col, existing_column_names, existing_query_names)
|
|
177
176
|
|
|
178
177
|
@classmethod
|
pixeltable/dataframe.py
CHANGED
|
@@ -371,15 +371,10 @@ class DataFrame:
|
|
|
371
371
|
group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
|
|
372
372
|
order_by_clause=order_by_clause, limit=self.limit_val)
|
|
373
373
|
|
|
374
|
-
def collect(self) -> DataFrameResultSet:
|
|
375
|
-
return self._collect()
|
|
376
|
-
|
|
377
|
-
def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
|
|
374
|
+
def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
|
|
378
375
|
try:
|
|
379
|
-
result_rows = []
|
|
380
376
|
for data_row in self._exec(conn):
|
|
381
|
-
|
|
382
|
-
result_rows.append(result_row)
|
|
377
|
+
yield [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
383
378
|
except excs.ExprEvalError as e:
|
|
384
379
|
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
|
|
385
380
|
if len(e.input_vals) > 0:
|
|
@@ -399,7 +394,11 @@ class DataFrame:
|
|
|
399
394
|
except sql.exc.DBAPIError as e:
|
|
400
395
|
raise excs.Error(f'Error during SQL execution:\n{e}')
|
|
401
396
|
|
|
402
|
-
|
|
397
|
+
def collect(self) -> DataFrameResultSet:
|
|
398
|
+
return self._collect()
|
|
399
|
+
|
|
400
|
+
def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
|
|
401
|
+
return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
|
|
403
402
|
|
|
404
403
|
def count(self) -> int:
|
|
405
404
|
from pixeltable.plan import Planner
|
pixeltable/env.py
CHANGED
|
@@ -494,13 +494,18 @@ class Env:
|
|
|
494
494
|
self.__register_package('anthropic')
|
|
495
495
|
self.__register_package('boto3')
|
|
496
496
|
self.__register_package('datasets')
|
|
497
|
+
self.__register_package('fiftyone')
|
|
497
498
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
499
|
+
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
498
500
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
501
|
+
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
499
502
|
self.__register_package('mistralai')
|
|
500
503
|
self.__register_package('mistune')
|
|
504
|
+
self.__register_package('ollama')
|
|
501
505
|
self.__register_package('openai')
|
|
502
506
|
self.__register_package('openpyxl')
|
|
503
507
|
self.__register_package('pyarrow')
|
|
508
|
+
self.__register_package('replicate')
|
|
504
509
|
self.__register_package('sentence_transformers', library_name='sentence-transformers')
|
|
505
510
|
self.__register_package('spacy')
|
|
506
511
|
self.__register_package('tiktoken')
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -6,6 +6,5 @@ from .exec_context import ExecContext
|
|
|
6
6
|
from .exec_node import ExecNode
|
|
7
7
|
from .expr_eval_node import ExprEvalNode
|
|
8
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
9
|
-
from .media_validation_node import MediaValidationNode
|
|
10
9
|
from .row_update_node import RowUpdateNode
|
|
11
10
|
from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode
|
|
@@ -2,11 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import Iterable,
|
|
5
|
+
from typing import Any, Iterable, Iterator, Optional, cast
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
9
9
|
import pixeltable.exprs as exprs
|
|
10
|
+
|
|
10
11
|
from .data_row_batch import DataRowBatch
|
|
11
12
|
from .exec_node import ExecNode
|
|
12
13
|
|
|
@@ -28,13 +29,15 @@ class AggregationNode(ExecNode):
|
|
|
28
29
|
self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
|
|
29
30
|
agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
|
|
30
31
|
):
|
|
31
|
-
|
|
32
|
+
output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
|
|
33
|
+
output_exprs.extend(agg_fn_calls)
|
|
34
|
+
super().__init__(row_builder, output_exprs, input_exprs, input)
|
|
32
35
|
self.input = input
|
|
33
36
|
self.group_by = group_by
|
|
34
37
|
self.input_exprs = list(input_exprs)
|
|
35
38
|
self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=self.input_exprs)
|
|
36
39
|
# we need to make sure to refer to the same exprs that RowBuilder.eval() will use
|
|
37
|
-
self.agg_fn_calls = self.agg_fn_eval_ctx.target_exprs
|
|
40
|
+
self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
|
|
38
41
|
# create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
|
|
39
42
|
self.output_batch = DataRowBatch(tbl, row_builder, 0)
|
|
40
43
|
|
|
@@ -79,7 +79,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
79
79
|
|
|
80
80
|
return input_batch
|
|
81
81
|
|
|
82
|
-
def _fetch_url(self, row: exprs.DataRow, slot_idx: int) -> Optional[
|
|
82
|
+
def _fetch_url(self, row: exprs.DataRow, slot_idx: int) -> Optional[Path]:
|
|
83
83
|
"""Fetches a remote URL into Env.tmp_dir and returns its path"""
|
|
84
84
|
url = row.file_urls[slot_idx]
|
|
85
85
|
parsed = urllib.parse.urlparse(url)
|
|
@@ -49,7 +49,7 @@ class DataRowBatch:
|
|
|
49
49
|
def __len__(self) -> int:
|
|
50
50
|
return len(self.rows)
|
|
51
51
|
|
|
52
|
-
def __getitem__(self, index:
|
|
52
|
+
def __getitem__(self, index: int) -> exprs.DataRow:
|
|
53
53
|
return self.rows[index]
|
|
54
54
|
|
|
55
55
|
def flush_imgs(
|
|
@@ -74,21 +74,4 @@ class DataRowBatch:
|
|
|
74
74
|
row.flush_img(slot_idx)
|
|
75
75
|
|
|
76
76
|
def __iter__(self) -> Iterator[exprs.DataRow]:
|
|
77
|
-
return
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
class DataRowBatchIterator:
|
|
81
|
-
"""
|
|
82
|
-
Iterator over a DataRowBatch.
|
|
83
|
-
"""
|
|
84
|
-
def __init__(self, batch: DataRowBatch):
|
|
85
|
-
self.row_batch = batch
|
|
86
|
-
self.index = 0
|
|
87
|
-
|
|
88
|
-
def __next__(self) -> exprs.DataRow:
|
|
89
|
-
if self.index >= len(self.row_batch.rows):
|
|
90
|
-
raise StopIteration
|
|
91
|
-
row = self.row_batch.rows[self.index]
|
|
92
|
-
self.index += 1
|
|
93
|
-
return row
|
|
94
|
-
|
|
77
|
+
return iter(self.rows)
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
|
-
from typing import Iterable,
|
|
4
|
+
from typing import TYPE_CHECKING, Iterable, Iterator, List, Optional
|
|
5
5
|
|
|
6
6
|
import pixeltable.exprs as exprs
|
|
7
|
+
|
|
7
8
|
from .data_row_batch import DataRowBatch
|
|
8
9
|
from .exec_context import ExecContext
|
|
9
10
|
|