pixeltable 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +53 -0
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +181 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +192 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +695 -0
- pixeltable/catalog/table_version.py +1026 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/dataframe.py +749 -0
- pixeltable/env.py +466 -0
- pixeltable/exceptions.py +17 -0
- pixeltable/exec/__init__.py +10 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +94 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +73 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +226 -0
- pixeltable/exprs/__init__.py +25 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +114 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +199 -0
- pixeltable/exprs/expr.py +594 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +382 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +96 -0
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +109 -0
- pixeltable/exprs/inline_dict.py +103 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +66 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +329 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +7 -0
- pixeltable/func/aggregate_function.py +197 -0
- pixeltable/func/callable_function.py +113 -0
- pixeltable/func/expr_template_function.py +99 -0
- pixeltable/func/function.py +141 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +46 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +162 -0
- pixeltable/func/udf.py +164 -0
- pixeltable/functions/__init__.py +95 -0
- pixeltable/functions/eval.py +215 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +167 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +289 -0
- pixeltable/functions/pil/image.py +147 -0
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +143 -0
- pixeltable/functions/util.py +52 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/io/hf_datasets.py +188 -0
- pixeltable/io/pandas.py +148 -0
- pixeltable/io/parquet.py +192 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +52 -0
- pixeltable/iterators/document.py +432 -0
- pixeltable/iterators/video.py +88 -0
- pixeltable/metadata/__init__.py +58 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +234 -0
- pixeltable/plan.py +620 -0
- pixeltable/store.py +424 -0
- pixeltable/tool/create_test_db_dump.py +184 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +846 -0
- pixeltable/utils/__init__.py +17 -0
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +18 -0
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +69 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/http_server.py +70 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.0.0.dist-info/LICENSE +18 -0
- pixeltable-0.0.0.dist-info/METADATA +131 -0
- pixeltable-0.0.0.dist-info/RECORD +119 -0
- pixeltable-0.0.0.dist-info/WHEEL +4 -0
pixeltable/__init__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from .catalog import Column, Table, InsertableTable, View
|
|
2
|
+
from .dataframe import DataFrame
|
|
3
|
+
from .exceptions import Error, Error
|
|
4
|
+
from .exprs import RELATIVE_PATH_ROOT
|
|
5
|
+
from .func import Function, udf, uda, Aggregator, expr_udf
|
|
6
|
+
from .globals import *
|
|
7
|
+
from .type_system import (
|
|
8
|
+
ColumnType,
|
|
9
|
+
StringType,
|
|
10
|
+
IntType,
|
|
11
|
+
FloatType,
|
|
12
|
+
BoolType,
|
|
13
|
+
TimestampType,
|
|
14
|
+
JsonType,
|
|
15
|
+
ArrayType,
|
|
16
|
+
ImageType,
|
|
17
|
+
VideoType,
|
|
18
|
+
AudioType,
|
|
19
|
+
DocumentType,
|
|
20
|
+
)
|
|
21
|
+
from .utils.help import help
|
|
22
|
+
|
|
23
|
+
# noinspection PyUnresolvedReferences
|
|
24
|
+
from . import functions, io
|
|
25
|
+
from .__version__ import __version__, __version_tuple__
|
|
26
|
+
|
|
27
|
+
# Names exported by `from pixeltable import *` (order kept as originally declared).
__all__ = [
    # core catalog / query objects
    'DataFrame', 'Column', 'Table', 'InsertableTable', 'View',
    # exceptions
    'Error',
    # column types (plus RELATIVE_PATH_ROOT, which sits here in the original ordering)
    'ColumnType', 'StringType', 'IntType', 'FloatType', 'BoolType', 'TimestampType', 'JsonType',
    'RELATIVE_PATH_ROOT',
    'ArrayType', 'ImageType', 'VideoType', 'AudioType', 'DocumentType',
    # functions and decorators
    'Function', 'help', 'udf', 'Aggregator', 'uda', 'expr_udf',
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .catalog import Catalog
|
|
2
|
+
from .column import Column
|
|
3
|
+
from .table_version_path import TableVersionPath
|
|
4
|
+
from .table_version import TableVersion
|
|
5
|
+
from .schema_object import SchemaObject
|
|
6
|
+
from .named_function import NamedFunction
|
|
7
|
+
from .dir import Dir
|
|
8
|
+
from .table import Table
|
|
9
|
+
from .insertable_table import InsertableTable
|
|
10
|
+
from .view import View
|
|
11
|
+
from .path import Path
|
|
12
|
+
from .path_dict import PathDict
|
|
13
|
+
from .globals import is_valid_identifier, is_valid_path
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional, List, Any, Dict, Tuple
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
import dataclasses
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import sqlalchemy as sql
|
|
8
|
+
import sqlalchemy.orm as orm
|
|
9
|
+
|
|
10
|
+
from .table_version import TableVersion
|
|
11
|
+
from .table_version_path import TableVersionPath
|
|
12
|
+
from .table import Table
|
|
13
|
+
from .named_function import NamedFunction
|
|
14
|
+
from .path_dict import PathDict
|
|
15
|
+
import pixeltable.env as env
|
|
16
|
+
import pixeltable.metadata.schema as schema
|
|
17
|
+
|
|
18
|
+
_logger = logging.getLogger('pixeltable')
|
|
19
|
+
|
|
20
|
+
class Catalog:
    """A repository of catalog objects.

    Used as a singleton: obtain the instance via ``Catalog.get()``; ``Catalog.clear()`` drops it (for testing).
    """
    _instance: Optional[Catalog] = None

    @classmethod
    def get(cls) -> Catalog:
        """Return the singleton instance, creating it and loading all tables/views on first use."""
        if cls._instance is None:
            # NOTE(review): the instance is published in cls._instance *before* the load runs; presumably
            # objects constructed during the load call Catalog.get() themselves -- confirm before reordering
            cls._instance = cls()
            with orm.Session(env.Env.get().engine, future=True) as session:
                cls._instance._load_table_versions(session)
                #cls._instance._load_functions(session)
        return cls._instance

    @classmethod
    def clear(cls) -> None:
        """Remove the instance. Used for testing."""
        cls._instance = None

    def __init__(self) -> None:
        # key: [id, version]
        # - mutable version of a table: version == None (even though TableVersion.version is set correctly)
        # - snapshot versions: records the version of the snapshot
        self.tbl_versions: Dict[Tuple[UUID, Optional[int]], TableVersion] = {}

        self.tbls: Dict[UUID, Table] = {}  # don't use a defaultdict here, it doesn't cooperate with the debugger
        self.tbl_dependents: Dict[UUID, List[Table]] = {}

        self._init_store()
        self.paths = PathDict()  # do this after _init_store()

    def _init_store(self) -> None:
        """One-time initialization of the stored catalog. Idempotent."""
        with orm.Session(env.Env.get().engine, future=True) as session:
            if session.query(sql.func.count(schema.Dir.id)).scalar() > 0:
                # at least one directory record exists: nothing to do
                return
            # create a top-level directory, so that every schema object has a directory
            dir_md = schema.DirMd(name='')
            dir_record = schema.Dir(parent_id=None, md=dataclasses.asdict(dir_md))
            session.add(dir_record)
            session.flush()
            session.commit()
            _logger.info('Initialized catalog')

    def _load_snapshot_version(
            self, tbl_id: UUID, version: int, base: Optional[TableVersion], session: orm.Session
    ) -> TableVersion:
        """Create the snapshot TableVersion for version ``version`` of table ``tbl_id`` from stored metadata.

        Args:
            tbl_id: id of the table record
            version: table version to load; the schema version is the one recorded for that table version
            base: passed through as the ``base`` of the new TableVersion
            session: session used to run the metadata query

        Returns:
            a TableVersion created with ``is_snapshot=True``
        """
        q = session.query(schema.Table, schema.TableSchemaVersion) \
            .select_from(schema.Table) \
            .join(schema.TableVersion) \
            .join(schema.TableSchemaVersion) \
            .where(schema.Table.id == tbl_id) \
            .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {version}")) \
            .where(sql.text((
                f"({schema.TableVersion.__table__}.md->>'schema_version')::int = "
                f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}")))
        # q.one() raises unless exactly one row matches
        tbl_record, schema_version_record = q.one()
        tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
        schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
        # we ignore tbl_record.base_tbl_id/base_snapshot_id and use 'base' instead: if the base is a snapshot
        # we'd have to look that up first
        return TableVersion(tbl_record.id, tbl_md, version, schema_version_md, is_snapshot=True, base=base)

    def _load_table_versions(self, session: orm.Session) -> None:
        """Load all tables and views from the metadata store and register them in this catalog.

        For every loaded object this fills ``self.tbls``, ``self.tbl_dependents`` and ``self.paths``.
        """
        from .insertable_table import InsertableTable
        from .view import View

        # load tables/views;
        # do this in ascending order of creation ts so that we can resolve base references in one pass
        q = session.query(schema.Table, schema.TableSchemaVersion) \
            .select_from(schema.Table) \
            .join(schema.TableVersion) \
            .join(schema.TableSchemaVersion) \
            .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = 0")) \
            .where(sql.text((
                f"({schema.Table.__table__}.md->>'current_schema_version')::int = "
                f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}"))) \
            .order_by(sql.text(f"({schema.TableVersion.__table__}.md->>'created_at')::float"))

        for tbl_record, schema_version_record in q.all():
            tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
            schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
            view_md = tbl_md.view_md

            if view_md is not None:
                assert len(view_md.base_versions) > 0
                # construct a TableVersionPath for the view
                refd_versions = [(UUID(tbl_id), version) for tbl_id, version in view_md.base_versions]
                base_path: Optional[TableVersionPath] = None
                base: Optional[TableVersion] = None
                # go through the versions in reverse order, so we can construct TableVersionPaths
                for base_id, version in refd_versions[::-1]:
                    base_version = self.tbl_versions.get((base_id, version), None)
                    if base_version is None:
                        # if this is a reference to a mutable table, we should have loaded it already
                        assert version is not None
                        base_version = self._load_snapshot_version(base_id, version, base, session)
                    base_path = TableVersionPath(base_version, base=base_path)
                    base = base_version
                assert base_path is not None

                base_tbl = self.tbls[base_path.tbl_version.id]
                is_snapshot = view_md.is_snapshot
                snapshot_only = is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
                if snapshot_only:
                    # this is a pure snapshot, without a physical table backing it
                    view_path = base_path
                else:
                    tbl_version = TableVersion(
                        tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md, is_snapshot=is_snapshot,
                        base=base_path.tbl_version if is_snapshot else None,
                        base_path=base_path if not is_snapshot else None)
                    view_path = TableVersionPath(tbl_version, base=base_path)

                tbl = View(
                    tbl_record.id, tbl_record.dir_id, tbl_md.name, view_path, base_tbl,
                    snapshot_only=snapshot_only)
                self.tbl_dependents[base_tbl._id].append(tbl)

            else:
                tbl_version = TableVersion(tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md)
                tbl = InsertableTable(tbl_record.dir_id, tbl_version)

            self.tbls[tbl._id] = tbl
            self.tbl_dependents[tbl._id] = []
            self.paths.add_schema_obj(tbl._dir_id, tbl_md.name, tbl)
|
|
148
|
+
|
|
149
|
+
# def _load_functions(self, session: orm.Session) -> None:
|
|
150
|
+
# # load Function metadata; doesn't load the actual callable, which can be large and is only done on-demand by the
|
|
151
|
+
# # FunctionRegistry
|
|
152
|
+
# q = session.query(schema.Function.id, schema.Function.dir_id, schema.Function.md) \
|
|
153
|
+
# .where(sql.text(f"({schema.Function.__table__}.md->>'name')::text IS NOT NULL"))
|
|
154
|
+
# for id, dir_id, md in q.all():
|
|
155
|
+
# assert 'name' in md
|
|
156
|
+
# name = md['name']
|
|
157
|
+
# assert name is not None
|
|
158
|
+
# named_fn = NamedFunction(id, dir_id, name)
|
|
159
|
+
# self.paths.add_schema_obj(dir_id, name, named_fn)
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional, Union, Callable, Set
|
|
5
|
+
|
|
6
|
+
import sqlalchemy as sql
|
|
7
|
+
|
|
8
|
+
import pixeltable.exceptions as excs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
10
|
+
from .globals import is_valid_identifier
|
|
11
|
+
|
|
12
|
+
_logger = logging.getLogger('pixeltable')
|
|
13
|
+
|
|
14
|
+
class Column:
    """Representation of a column in the schema of a Table/DataFrame.

    A Column contains all the metadata necessary for executing queries and updates against a particular version of a
    table/view.
    """
    def __init__(
            self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
            computed_with: Optional[Union['Expr', Callable]] = None,
            is_pk: bool = False, stored: Optional[bool] = None,
            col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
            schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
    ):
        """Column constructor.

        Args:
            name: column name; None for system columns (eg, index columns)
            col_type: column type; can be None if the type can be derived from ``computed_with``
            computed_with: a callable or an Expr object that computes the column value
            is_pk: if True, this column is part of the primary key
            stored: determines whether a computed column is present in the stored table or recomputed on demand
            col_id: column ID (only used internally)
            schema_version_add: stored as-is (presumably the schema version that added the column -- confirm)
            schema_version_drop: stored as-is (presumably the schema version that dropped the column -- confirm)
            sa_col_type: if not None, used as the type of the storage column instead of ``col_type.to_sa_type()``

        Raises:
            Error: if ``name`` is not a valid identifier, if neither ``col_type`` nor ``computed_with`` is given,
                or if ``computed_with`` is neither an expression nor a callable (or is a callable without
                ``col_type``)

        Computed columns: those have a non-None ``computed_with`` argument
        - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
          col_type is None
        - when loaded from md store: ``computed_with`` is set and col_type is set

        ``computed_with`` is a Callable:
        - the callable's parameter names must correspond to existing columns in the table for which this Column
          is being used
        - ``col_type`` needs to be set to the callable's return type

        ``stored`` (only valid for computed image columns):
        - if True: the column is present in the stored table
        - if False: the column is not present in the stored table and recomputed during a query
        - if None: the system chooses for you (at present, this is always False, but this may change in the future)
        """
        if name is not None and not is_valid_identifier(name):
            raise excs.Error(f"Invalid column name: '{name}'")
        self.name = name
        if col_type is None and computed_with is None:
            raise excs.Error(f'Column `{name}`: col_type is required if computed_with is not specified')

        self.value_expr: Optional['Expr'] = None
        self.compute_func: Optional[Callable] = None
        from pixeltable import exprs
        if computed_with is not None:
            value_expr = exprs.Expr.from_object(computed_with)
            if value_expr is None:
                # computed_with needs to be a Callable
                if not callable(computed_with):
                    raise excs.Error(
                        f'Column {name}: computed_with needs to be either a Pixeltable expression or a Callable, '
                        f'but it is a {type(computed_with)}')
                if col_type is None:
                    raise excs.Error(f'Column {name}: col_type is required if computed_with is a Callable')
                # we need to turn the computed_with function into an Expr, but this requires resolving
                # column name references and for that we need to wait until we're assigned to a Table
                self.compute_func = computed_with
            else:
                self.value_expr = value_expr.copy()
                self.col_type = self.value_expr.col_type

        # an explicitly passed col_type takes precedence over the type derived from the value expr
        if col_type is not None:
            self.col_type = col_type
        assert self.col_type is not None

        self.stored = stored
        self.dependent_cols: Set[Column] = set()  # cols with value_exprs that reference us; set by TableVersion
        self.id = col_id
        self.is_pk = is_pk
        self.schema_version_add = schema_version_add
        self.schema_version_drop = schema_version_drop

        # column in the stored table for the values of this Column
        self.sa_col: Optional[sql.schema.Column] = None
        self.sa_col_type = sa_col_type

        # computed cols also have storage columns for the exception string and type
        self.sa_errormsg_col: Optional[sql.schema.Column] = None
        self.sa_errortype_col: Optional[sql.schema.Column] = None
        from .table_version import TableVersion
        self.tbl: Optional[TableVersion] = None  # set by owning TableVersion

    def __hash__(self) -> int:
        """Hash on (owning table id, column id), consistent with ``__eq__``."""
        assert self.tbl is not None
        return hash((self.tbl.id, self.id))

    def check_value_expr(self) -> None:
        """Raise Error if this is an explicitly unstored computed column that uses a window function."""
        assert self.value_expr is not None
        # only an explicit stored=False is rejected; stored=None (undecided) passes
        if self.stored is False and self.is_computed and self.has_window_fn_call():
            raise excs.Error(
                f'Column {self.name}: stored={self.stored} not supported for columns computed with window functions:'
                f'\n{self.value_expr}')

    def has_window_fn_call(self) -> bool:
        """Return True if the value expr contains a FunctionCall with ``is_window_fn_call`` set."""
        if self.value_expr is None:
            return False
        from pixeltable import exprs
        window_fn_calls = self.value_expr.subexprs(
            filter=lambda e: isinstance(e, exprs.FunctionCall) and e.is_window_fn_call)
        # we only need to know whether there is at least one match
        return any(True for _ in window_fn_calls)

    def get_idx_info(self) -> dict[str, 'pixeltable.catalog.TableVersion.IndexInfo']:
        """Return the entries of the owning table's ``idxs_by_name`` whose ``col`` is this column."""
        assert self.tbl is not None
        return {name: info for name, info in self.tbl.idxs_by_name.items() if info.col == self}

    @property
    def is_computed(self) -> bool:
        """True if the column has a compute function or a value expression."""
        return self.compute_func is not None or self.value_expr is not None

    @property
    def is_stored(self) -> bool:
        """Returns True if column is materialized in the stored table."""
        assert self.stored is not None
        return self.stored

    @property
    def records_errors(self) -> bool:
        """True if this column also stores error information."""
        return self.is_stored and (self.is_computed or self.col_type.is_media_type())

    def source(self) -> None:
        """
        If this is a computed col and the top-level expr is a function call, print the source, if possible.
        """
        from pixeltable import exprs
        if self.value_expr is None or not isinstance(self.value_expr, exprs.FunctionCall):
            return
        self.value_expr.fn.source()

    def create_sa_cols(self) -> None:
        """
        These need to be recreated for every new table schema version.
        """
        assert self.is_stored
        # all storage columns are nullable (we deal with null errors in Pixeltable directly)
        self.sa_col = sql.Column(self.store_name(), self.get_sa_col_type(), nullable=True)
        if self.is_computed or self.col_type.is_media_type():
            self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), ts.StringType().to_sa_type(), nullable=True)
            self.sa_errortype_col = sql.Column(self.errortype_store_name(), ts.StringType().to_sa_type(), nullable=True)

    def get_sa_col_type(self) -> sql.sqltypes.TypeEngine:
        """Return the storage column type: ``sa_col_type`` if set, otherwise the one derived from ``col_type``."""
        return self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type

    def store_name(self) -> str:
        """Return the name of the storage column, ``col_<id>``."""
        assert self.id is not None
        assert self.is_stored
        return f'col_{self.id}'

    def errormsg_store_name(self) -> str:
        """Return the name of the storage column for the error message."""
        return f'{self.store_name()}_errormsg'

    def errortype_store_name(self) -> str:
        """Return the name of the storage column for the error type."""
        return f'{self.store_name()}_errortype'

    def __str__(self) -> str:
        return f'{self.name}: {self.col_type}'

    def __eq__(self, other: object) -> bool:
        """Columns are equal if they belong to the same table (by id) and have the same column id."""
        if not isinstance(other, Column):
            return False
        assert self.tbl is not None
        assert other.tbl is not None
        return self.tbl.id == other.tbl.id and self.id == other.id
|
|
181
|
+
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import dataclasses
|
|
4
|
+
import logging
|
|
5
|
+
from uuid import UUID
|
|
6
|
+
|
|
7
|
+
import sqlalchemy as sql
|
|
8
|
+
|
|
9
|
+
from .schema_object import SchemaObject
|
|
10
|
+
from pixeltable.env import Env
|
|
11
|
+
from pixeltable.metadata import schema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_logger = logging.getLogger('pixeltable')
|
|
15
|
+
|
|
16
|
+
class Dir(SchemaObject):
    """Schema object representing a directory."""

    def __init__(self, id: UUID, parent_id: UUID, name: str):
        # note the argument order expected by SchemaObject: (id, name, parent_id)
        super().__init__(id, name, parent_id)

    @classmethod
    def display_name(cls) -> str:
        """Return the user-facing name of this kind of schema object."""
        return 'directory'

    def move(self, new_name: str, new_dir_id: UUID) -> None:
        """Rename/relocate this directory and write the change to the directory record in the store.

        Args:
            new_name: new name of the directory
            new_dir_id: id of the new parent directory
        """
        super().move(new_name, new_dir_id)
        md_dict = dataclasses.asdict(schema.DirMd(name=new_name))
        # self._dir_id is read after super().move(); presumably it was updated to new_dir_id there -- confirm
        update_stmt = (
            sql.update(schema.Dir.__table__)
            .values({schema.Dir.parent_id: self._dir_id, schema.Dir.md: md_dict})
            .where(schema.Dir.id == self._id)
        )
        with Env.get().engine.begin() as conn:
            conn.execute(update_stmt)
|
|
32
|
+
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import dataclasses
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
_logger = logging.getLogger('pixeltable')
|
|
7
|
+
|
|
8
|
+
# name of the position column in a component view
|
|
9
|
+
POS_COLUMN_NAME = 'pos'
|
|
10
|
+
|
|
11
|
+
@dataclasses.dataclass
class UpdateStatus:
    """Counters and column lists describing the outcome of a data-modifying table operation.

    All counters default to 0 and both lists default to a fresh empty list per instance.
    """
    # number of rows
    num_rows: int = 0

    # TODO: disambiguate what this means: # of slots computed or # of columns computed?
    num_computed_values: int = 0

    # number of exceptions
    num_excs: int = 0

    # names of updated columns
    updated_cols: List[str] = dataclasses.field(default_factory=list)

    # names of columns with exceptions
    cols_with_excs: List[str] = dataclasses.field(default_factory=list)
|
|
19
|
+
|
|
20
|
+
def is_valid_identifier(name: str) -> bool:
    """Return True if ``name`` is a Python identifier that does not start with an underscore."""
    if not name.isidentifier():
        return False
    # names with a leading underscore are rejected
    return not name.startswith('_')
|
|
22
|
+
|
|
23
|
+
def is_valid_path(path: str, empty_is_valid : bool) -> bool:
    """Return True if every '.'-separated component of ``path`` is a valid identifier.

    Args:
        path: dotted path to check
        empty_is_valid: the result to return for the empty path ''
    """
    if path == '':
        return empty_is_valid
    # an empty component (eg, from 'a..b' or a trailing '.') fails the identifier check
    return all(is_valid_identifier(component) for component in path.split('.'))
|
|
31
|
+
|
|
32
|
+
def is_system_column_name(name: str) -> bool:
    """Return True if ``name`` equals POS_COLUMN_NAME."""
    is_pos_column = name == POS_COLUMN_NAME
    return is_pos_column
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional, List, Any, Dict, overload, Iterable
|
|
5
|
+
from uuid import UUID
|
|
6
|
+
|
|
7
|
+
import sqlalchemy.orm as orm
|
|
8
|
+
|
|
9
|
+
import pixeltable
|
|
10
|
+
import pixeltable.type_system as ts
|
|
11
|
+
from pixeltable import exceptions as excs
|
|
12
|
+
from pixeltable.env import Env
|
|
13
|
+
from .catalog import Catalog
|
|
14
|
+
from .globals import UpdateStatus
|
|
15
|
+
from .table import Table
|
|
16
|
+
from .table_version import TableVersion
|
|
17
|
+
from .table_version_path import TableVersionPath
|
|
18
|
+
|
|
19
|
+
_logger = logging.getLogger('pixeltable')
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class InsertableTable(Table):
    """A `Table` that allows inserting and deleting rows."""

    def __init__(self, dir_id: UUID, tbl_version: TableVersion):
        tbl_version_path = TableVersionPath(tbl_version)
        super().__init__(tbl_version.id, dir_id, tbl_version.name, tbl_version_path)

    @classmethod
    def display_name(cls) -> str:
        """Return the user-facing name of this kind of schema object."""
        return 'table'

    # MODULE-LOCAL, NOT PUBLIC
    @classmethod
    def create(
            cls, dir_id: UUID, name: str, schema: Dict[str, ts.ColumnType], primary_key: List[str],
            num_retained_versions: int, comment: str
    ) -> InsertableTable:
        """Create a new table in the store and register it with the Catalog.

        Args:
            dir_id: id of the directory that contains the table
            name: table name
            schema: maps column names to column types
            primary_key: names of the primary key columns; each must be in ``schema`` and must not be nullable
            num_retained_versions: passed through to ``TableVersion.create()``
            comment: passed through to ``TableVersion.create()``

        Raises:
            Error: if a primary key column is missing from the schema or is nullable
        """
        columns = cls._create_columns(schema)
        cls._verify_schema(columns)
        cols_by_name = {col.name: col for col in columns}
        for pk_col in primary_key:
            if pk_col not in cols_by_name:
                raise excs.Error(f'Primary key column {pk_col} not found in table schema')
            col = cols_by_name[pk_col]
            if col.col_type.nullable:
                raise excs.Error(f'Primary key column {pk_col} cannot be nullable')
            col.is_pk = True

        with orm.Session(Env.get().engine, future=True) as session:
            _, tbl_version = TableVersion.create(session, dir_id, name, columns, num_retained_versions, comment)
            tbl = cls(dir_id, tbl_version)
            session.commit()
            cat = Catalog.get()
            cat.tbl_dependents[tbl._id] = []
            cat.tbls[tbl._id] = tbl

            _logger.info(f'Created table `{name}`, id={tbl_version.id}')
            print(f'Created table `{name}`.')
            return tbl

    @overload
    def insert(self, rows: Iterable[Dict[str, Any]], /, print_stats: bool = False, fail_on_exception: bool = True): ...

    @overload
    def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...

    def insert(self, *args, **kwargs) -> UpdateStatus:
        """Insert rows into table.

        To insert multiple rows at a time:

        ``insert(rows: List[Dict[str, Any]], print_stats: bool = False, fail_on_exception: bool = True)``

        To insert just a single row, you can use the more convenient syntax:
        ``insert(print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any)``

        Args:
            rows: (if inserting multiple rows) A list of rows to insert, each of which is a dictionary mapping column
                names to values.
            kwargs: (if inserting a single row) keyword-argument pairs representing column names and values.
            print_stats: If ``True``, print statistics about the cost of computed columns.
            fail_on_exception:
                Determines how exceptions in computed columns and invalid media files (e.g., corrupt images)
                are handled.
                If ``False``, store error information (accessible as column properties 'errortype' and 'errormsg')
                for those cases, but continue inserting rows.
                If ``True``, raise an exception that aborts the insert.

        Returns:
            execution status

        Raises:
            Error: if a row does not match the table schema or contains values for computed columns, or if the
                arguments do not match one of the two call forms above

        Examples:
            Insert two rows into a table with three int columns ``a``, ``b``, and ``c``. Column ``c`` is nullable.

            >>> tbl.insert([{'a': 1, 'b': 1, 'c': 1}, {'a': 2, 'b': 2}])

            Insert a single row into a table with three int columns ``a``, ``b``, and ``c``.

            >>> tbl.insert(a=1, b=1, c=1)
        """
        if len(args) > 3:
            raise excs.Error(f'insert() accepts at most 3 positional arguments, got {len(args)}')
        # the multi-row form allows print_stats and fail_on_exception to follow `rows` positionally
        for param_name, param_val in zip(('print_stats', 'fail_on_exception'), args[1:]):
            if param_name in kwargs:
                raise excs.Error(f"insert() got multiple values for argument '{param_name}'")
            kwargs[param_name] = param_val
        print_stats = kwargs.pop('print_stats', False)
        fail_on_exception = kwargs.pop('fail_on_exception', True)

        if len(args) > 0:
            # There's a positional argument; this means `rows` is expressed as a
            # list of dicts (multi-insert)
            if len(kwargs) > 0:
                # these would otherwise be dropped without any indication to the caller
                raise excs.Error(
                    f'Column values passed as keyword arguments ({", ".join(kwargs)}) cannot be combined '
                    f'with a positional rows argument')
            try:
                rows = list(args[0])
            except TypeError:
                raise excs.Error('rows must be a list of dictionaries') from None
        else:
            # No positional argument; this means we're inserting a single row
            # using kwargs syntax
            rows = [kwargs]

        if len(rows) == 0:
            raise excs.Error('rows must not be empty')
        for row in rows:
            if not isinstance(row, dict):
                raise excs.Error('rows must be a list of dictionaries')
        self._validate_input_rows(rows)
        result = self.tbl_version.insert(rows, print_stats=print_stats, fail_on_exception=fail_on_exception)

        if result.num_excs == 0:
            cols_with_excs_str = ''
        else:
            cols_with_excs_str = \
                f' across {len(result.cols_with_excs)} column{"" if len(result.cols_with_excs) == 1 else "s"}'
            cols_with_excs_str += f' ({", ".join(result.cols_with_excs)})'
        msg = (
            f'Inserted {result.num_rows} row{"" if result.num_rows == 1 else "s"} '
            f'with {result.num_excs} error{"" if result.num_excs == 1 else "s"}{cols_with_excs_str}.'
        )
        print(msg)
        _logger.info(f'InsertableTable {self._name}: {msg}')
        return result

    def _validate_input_rows(self, rows: List[Dict[str, Any]]) -> None:
        """Verify that the input rows match the table schema.

        Each value is replaced in its row dict by the result of ``col_type.create_literal()``, ie, the dicts in
        ``rows`` are modified in place.

        Raises:
            Error: if a row lacks a required column, names an unknown or computed column, or contains a value
                that ``create_literal()`` rejects with a TypeError
        """
        valid_col_names = set(self.column_names())
        reqd_col_names = set(self.tbl_version_path.tbl_version.get_required_col_names())
        computed_col_names = set(self.tbl_version_path.tbl_version.get_computed_col_names())
        for row in rows:
            assert isinstance(row, dict)
            missing_col_names = reqd_col_names - set(row)
            if len(missing_col_names) > 0:
                # sorted: keeps the message deterministic
                raise excs.Error(f'Missing required column(s) ({", ".join(sorted(missing_col_names))}) in row {row}')

            # we only assign to existing keys below, so iterating over row.items() is safe
            for col_name, val in row.items():
                if col_name not in valid_col_names:
                    raise excs.Error(f'Unknown column name {col_name} in row {row}')
                if col_name in computed_col_names:
                    raise excs.Error(f'Value for computed column {col_name} in row {row}')

                # validate data
                col = self.tbl_version_path.get_column(col_name)
                try:
                    # basic sanity checks here
                    checked_val = col.col_type.create_literal(val)
                    row[col_name] = checked_val
                except TypeError as e:
                    msg = str(e)
                    if len(msg) > 0:
                        # the message continues a sentence, so lower-case its first character
                        msg = msg[0].lower() + msg[1:]
                    raise excs.Error(f'Error in column {col.name}: {msg}\nRow: {row}') from e

    def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> UpdateStatus:
        """Delete rows in this table.

        Args:
            where: a Predicate to filter rows to delete.

        Returns:
            execution status

        Raises:
            Error: if ``where`` is not a Predicate or cannot be expressed in SQL

        Examples:
            Delete all rows in a table:

            >>> tbl.delete()

            Delete all rows in a table where column `a` is greater than 5:

            >>> tbl.delete(tbl.a > 5)
        """
        from pixeltable.exprs import Predicate
        from pixeltable.plan import Planner
        if where is not None:
            if not isinstance(where, Predicate):
                raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
            analysis_info = Planner.analyze(self.tbl_version_path, where)
            # for now we require that the updated rows can be identified via SQL, rather than via a Python filter
            if analysis_info.filter is not None:
                raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')

        return self.tbl_version.delete(where)
|