pixeltable 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +15 -33
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +1 -1
- pixeltable/catalog/column.py +28 -16
- pixeltable/catalog/dir.py +2 -2
- pixeltable/catalog/insertable_table.py +5 -55
- pixeltable/catalog/named_function.py +2 -2
- pixeltable/catalog/schema_object.py +2 -7
- pixeltable/catalog/table.py +298 -204
- pixeltable/catalog/table_version.py +104 -139
- pixeltable/catalog/table_version_path.py +22 -4
- pixeltable/catalog/view.py +20 -10
- pixeltable/dataframe.py +128 -25
- pixeltable/env.py +21 -14
- pixeltable/exec/exec_context.py +5 -0
- pixeltable/exec/exec_node.py +1 -0
- pixeltable/exec/in_memory_data_node.py +29 -24
- pixeltable/exec/sql_scan_node.py +1 -1
- pixeltable/exprs/column_ref.py +13 -8
- pixeltable/exprs/data_row.py +4 -0
- pixeltable/exprs/expr.py +16 -1
- pixeltable/exprs/function_call.py +4 -4
- pixeltable/exprs/row_builder.py +29 -20
- pixeltable/exprs/similarity_expr.py +4 -3
- pixeltable/ext/functions/yolox.py +2 -1
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +14 -12
- pixeltable/func/callable_function.py +8 -6
- pixeltable/func/expr_template_function.py +13 -19
- pixeltable/func/function.py +3 -6
- pixeltable/func/query_template_function.py +84 -0
- pixeltable/func/signature.py +68 -23
- pixeltable/func/udf.py +13 -10
- pixeltable/functions/__init__.py +6 -91
- pixeltable/functions/eval.py +26 -14
- pixeltable/functions/fireworks.py +25 -23
- pixeltable/functions/globals.py +62 -0
- pixeltable/functions/huggingface.py +20 -16
- pixeltable/functions/image.py +170 -1
- pixeltable/functions/openai.py +95 -128
- pixeltable/functions/string.py +10 -2
- pixeltable/functions/together.py +95 -84
- pixeltable/functions/util.py +16 -0
- pixeltable/functions/video.py +94 -16
- pixeltable/functions/whisper.py +78 -0
- pixeltable/globals.py +1 -1
- pixeltable/io/__init__.py +10 -0
- pixeltable/io/external_store.py +370 -0
- pixeltable/io/globals.py +50 -22
- pixeltable/{datatransfer → io}/label_studio.py +279 -166
- pixeltable/io/parquet.py +1 -1
- pixeltable/iterators/__init__.py +9 -0
- pixeltable/iterators/string.py +40 -0
- pixeltable/metadata/__init__.py +6 -8
- pixeltable/metadata/converters/convert_10.py +2 -4
- pixeltable/metadata/converters/convert_12.py +7 -2
- pixeltable/metadata/converters/convert_13.py +6 -8
- pixeltable/metadata/converters/convert_14.py +2 -4
- pixeltable/metadata/converters/convert_15.py +40 -25
- pixeltable/metadata/converters/convert_16.py +18 -0
- pixeltable/metadata/converters/util.py +11 -8
- pixeltable/metadata/schema.py +3 -6
- pixeltable/plan.py +8 -7
- pixeltable/store.py +1 -1
- pixeltable/tool/create_test_db_dump.py +145 -54
- pixeltable/tool/embed_udf.py +9 -0
- pixeltable/type_system.py +1 -2
- pixeltable/utils/code.py +34 -0
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/METADATA +2 -2
- pixeltable-0.2.9.dist-info/RECORD +131 -0
- pixeltable/datatransfer/__init__.py +0 -1
- pixeltable/datatransfer/remote.py +0 -113
- pixeltable/functions/pil/image.py +0 -147
- pixeltable-0.2.7.dist-info/RECORD +0 -126
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/WHEEL +0 -0
pixeltable/iterators/__init__.py
CHANGED
@@ -1,3 +1,12 @@
 from .base import ComponentIterator
 from .document import DocumentSplitter
+from .string import StringSplitter
 from .video import FrameIterator
+
+__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
+__removed_symbols = {'base', 'document', 'video'}
+__all__ = sorted(list(__default_dir - __removed_symbols))
+
+
+def __dir__():
+    return __all__
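Note: the `__default_dir`/`__dir__` lines added above are a common idiom for curating a package's public namespace. A minimal, self-contained sketch of the same idiom (the demo_pkg package and Widget class are hypothetical, not part of pixeltable):

    # demo_pkg/__init__.py
    from .core import Widget  # also binds the submodule name 'core' in the package namespace

    __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
    __removed_symbols = {'core'}  # hide the submodule itself
    __all__ = sorted(__default_dir - __removed_symbols)

    def __dir__():
        return __all__  # dir(demo_pkg) now reports only ['Widget']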
pixeltable/iterators/string.py
ADDED
@@ -0,0 +1,40 @@
+from typing import Iterator, Any
+
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable.env import Env
+from pixeltable.iterators.base import ComponentIterator
+
+
+class StringSplitter(ComponentIterator):
+    # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
+    def __init__(self, text: str, *, separators: str):
+        if separators != 'sentence':
+            raise excs.Error('Only `sentence` separators are currently supported.')
+        self._text = text
+        self.doc = Env.get().spacy_nlp(self._text)
+        self.iter = self._iter()
+
+    def _iter(self) -> Iterator[dict[str, Any]]:
+        for sentence in self.doc.sents:
+            yield {'text': sentence.text}
+
+    def __next__(self) -> dict[str, Any]:
+        return next(self.iter)
+
+    def close(self) -> None:
+        pass
+
+    def set_pos(self, pos: int) -> None:
+        pass
+
+    @classmethod
+    def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
+        return {
+            'text': ts.StringType(),
+            'separators': ts.StringType(),
+        }
+
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        return {'text': ts.StringType()}, []
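The new StringSplitter emits one output row per spaCy-detected sentence. A usage sketch, assuming an existing table docs with a string column content, and following the ComponentIterator.create() pattern used by pixeltable's other iterators (e.g. FrameIterator); spaCy must be available to Env:

    import pixeltable as pxt
    from pixeltable.iterators import StringSplitter

    docs = pxt.get_table('docs')  # hypothetical existing table
    # component view with one row per sentence of docs.content
    sentences = pxt.create_view(
        'docs_sentences', docs,
        iterator=StringSplitter.create(text=docs.content, separators='sentence'))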
pixeltable/metadata/__init__.py
CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 16
+VERSION = 17
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -26,13 +26,11 @@ def create_system_info(engine: sql.engine.Engine) -> None:
 # key: old schema version
 converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
 
-def register_converter(version: int
-
-
-
-
-    # Converter to use when incrementing the schema version, but without any functional changes
-    pass
+def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
+    def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
+        global converter_cbs
+        converter_cbs[version] = fn
+    return decorator
 
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
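register_converter is now a decorator factory: each converter module registers its migration by decorating a function, instead of calling register_converter(version, fn) at module scope, as the converter diffs below show. A sketch of a no-op converter in the new style (version 18 is a hypothetical future bump, not part of this release):

    import sqlalchemy as sql

    from pixeltable.metadata import register_converter

    # converter to use when incrementing the schema version without functional changes
    @register_converter(version=18)
    def _(engine: sql.engine.Engine) -> None:
        pass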
pixeltable/metadata/converters/convert_10.py
CHANGED
@@ -4,7 +4,8 @@ from pixeltable.metadata.schema import Table, TableSchemaVersion
 from pixeltable.metadata import register_converter
 
 
-def convert_10(engine: sql.engine.Engine) -> None:
+@register_converter(version=10)
+def _(engine: sql.engine.Engine) -> None:
     default_table_attrs = {"comment": None, "num_retained_versions": 10}
     with engine.begin() as conn:
         # Because `parameters` wasn't actually used for anything,
@@ -13,6 +14,3 @@ def convert_10(engine: sql.engine.Engine) -> None:
         # Add `table_attrs` to all instances of tableschemaversions.md.
         conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
         return
-
-
-register_converter(10, convert_10)
pixeltable/metadata/converters/convert_12.py
CHANGED
@@ -1,3 +1,8 @@
-
+import sqlalchemy as sql
 
-register_converter
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=12)
+def _(engine: sql.engine.Engine) -> None:
+    pass
pixeltable/metadata/converters/convert_13.py
CHANGED
@@ -9,12 +9,13 @@ from pixeltable.metadata.schema import Table
 _logger = logging.getLogger('pixeltable')
 
 
-def convert_13(engine: sql.engine.Engine) -> None:
+@register_converter(version=13)
+def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Table)):
             id = row[0]
             md = row[2]
-            updated_md =
+            updated_md = __update_md(md)
             if updated_md != md:
                 _logger.info(f'Updating schema for table: {id}')
                 conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
@@ -23,19 +24,16 @@ def convert_13(engine: sql.engine.Engine) -> None:
 # Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
 # `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
 # so this is all we need to do.
-def
+def __update_md(md: Any) -> Any:
     if isinstance(md, dict):
         updated_md = {}
         for k, v in md.items():
             if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
                 updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
             else:
-                updated_md[k] =
+                updated_md[k] = __update_md(v)
         return updated_md
     elif isinstance(md, list):
-        return [
+        return [__update_md(v) for v in md]
     else:
         return md
-
-
-register_converter(13, convert_13)
pixeltable/metadata/converters/convert_14.py
CHANGED
@@ -4,10 +4,8 @@ from pixeltable.metadata.schema import Table
 from pixeltable.metadata import register_converter
 
 
-def convert_14(engine: sql.engine.Engine) -> None:
+@register_converter(version=14)
+def _(engine: sql.engine.Engine) -> None:
     default_remotes = {'remotes': []}
     with engine.begin() as conn:
         conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
-
-
-register_converter(14, convert_14)
pixeltable/metadata/converters/convert_15.py
CHANGED
@@ -1,29 +1,44 @@
-import uuid
 
+import inspect
+import logging
+from typing import Any
+
+import cloudpickle
 import sqlalchemy as sql
 
+import pixeltable.func as func
+import pixeltable.type_system as ts
 from pixeltable.metadata import register_converter
-from pixeltable.metadata.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from pixeltable.metadata.schema import Function
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=15)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Function)):
+            id, dir_id, md, binary_obj = row
+            md['md'] = __update_md(md['md'], binary_obj)
+            _logger.info(f'Updating function: {id}')
+            conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
+
+
+def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
+    # construct dict produced by CallableFunction.to_store()
+    py_fn = cloudpickle.loads(binary_obj)
+    py_params = inspect.signature(py_fn).parameters
+    return_type = ts.ColumnType.from_dict(orig_d['return_type'])
+    params: list[func.Parameter] = []
+    for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
+        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
+        default = py_params[name].default
+        kind = inspect._ParameterKind(kind_int)  # is there a way to avoid referencing a private type?
+        params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
+    is_batched = 'batch_size' in orig_d
+    sig = func.Signature(return_type, params, is_batched=is_batched)
+    d = {
+        'signature': sig.as_dict(),
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
+    return d
pixeltable/metadata/converters/convert_16.py
ADDED
@@ -0,0 +1,18 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=16)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        table_md_updater=__update_table_md
+    )
+
+
+def __update_table_md(table_md: dict) -> None:
+    # External stores are not migratable; just drop them
+    del table_md['remotes']
+    table_md['external_stores'] = {}
pixeltable/metadata/converters/util.py
CHANGED
@@ -11,8 +11,9 @@ __logger = logging.getLogger('pixeltable')
 
 def convert_table_md(
     engine: sql.engine.Engine,
+    table_md_updater: Optional[Callable[[dict], None]] = None,
     column_md_updater: Optional[Callable[[dict], None]] = None,
-
+    external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Any, Any], Optional[tuple[Any, Any]]]] = None
 ) -> None:
     with engine.begin() as conn:
@@ -21,10 +22,12 @@ def convert_table_md(
             table_md = row[2]
             assert isinstance(table_md, dict)
             updated_table_md = copy.deepcopy(table_md)
+            if table_md_updater is not None:
+                table_md_updater(updated_table_md)
             if column_md_updater is not None:
                 __update_column_md(updated_table_md, column_md_updater)
-            if
-
+            if external_store_md_updater is not None:
+                __update_external_store_md(updated_table_md, external_store_md_updater)
             if substitution_fn is not None:
                 updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
             if updated_table_md != table_md:
@@ -39,11 +42,11 @@ def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]
         column_md_updater(column_md)
 
 
-def
-
-    assert isinstance(
-    for
-
+def __update_external_store_md(table_md: dict, external_store_md_updater: Callable[[dict], None]) -> None:
+    stores_md = table_md['external_stores']
+    assert isinstance(stores_md, list)
+    for store_md in stores_md:
+        external_store_md_updater(store_md)
 
 
 def __substitute_md_rec(md: Any, substitution_fn: Callable[[Any, Any], Optional[tuple[Any, Any]]]) -> Any:
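convert_table_md now accepts a table_md_updater callback that mutates each table's top-level metadata dict in place, alongside the column-level and external-store hooks. A sketch of a converter using it, mirroring convert_16 above (the 'legacy_key' rename is hypothetical):

    import sqlalchemy as sql

    from pixeltable.metadata.converters.util import convert_table_md

    def _update_table_md(table_md: dict) -> None:
        # hypothetical in-place rewrite of one top-level key
        table_md['new_key'] = table_md.pop('legacy_key', None)

    def upgrade(engine: sql.engine.Engine) -> None:
        convert_table_md(engine, table_md_updater=_update_table_md)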
pixeltable/metadata/schema.py
CHANGED
@@ -92,9 +92,6 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
-    # if specified, the column is a stored proxy of another column
-    proxy_base: Optional[int]
-
 
 @dataclasses.dataclass
 class IndexMd:
@@ -145,9 +142,9 @@ class TableMd:
     # - every row is assigned a unique and immutable rowid on insertion
     next_row_id: int
 
-    # Metadata format for
-    # {'class': 'pixeltable.
-
+    # Metadata format for external stores:
+    # {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
+    external_stores: list[dict[str, Any]]
 
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
pixeltable/plan.py
CHANGED
@@ -217,15 +217,15 @@ class Planner:
         plan = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
 
         media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
+        if len(media_input_cols) > 0:
+            # prefetch external files for all input column refs for validation
+            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
+            plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
 
-
-        plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
-        plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)
-
-        computed_exprs = row_builder.default_eval_ctx.target_exprs
+        computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
         if len(computed_exprs) > 0:
             # add an ExprEvalNode when there are exprs to compute
-            plan = exec.ExprEvalNode(row_builder, computed_exprs,
+            plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
 
         plan.set_stored_img_cols(stored_img_col_info)
         plan.set_ctx(
@@ -355,7 +355,8 @@
         # - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         #   the store
         target = view.tbl_version  # the one we need to populate
-        stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+        #stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
+        stored_cols = [c for c in target.cols if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
pixeltable/store.py
CHANGED
@@ -263,7 +263,7 @@ class StoreBase:
         number of inserted rows, number of exceptions, set of column ids that have exceptions
         """
         assert v_min is not None
-        exec_plan.ctx.conn
+        exec_plan.ctx.set_conn(conn)
         batch_size = 16  # TODO: is this a good batch size?
         # TODO: total?
         num_excs = 0
pixeltable/tool/create_test_db_dump.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import os
 import pathlib
 import subprocess
+from typing import Any
 
 import pgserver
 import toml
@@ -12,8 +13,10 @@ import pixeltable as pxt
 import pixeltable.metadata as metadata
 from pixeltable.env import Env
 from pixeltable.func import Batch
+from pixeltable.io.external_store import Project
+from pixeltable.tool import embed_udf
 from pixeltable.type_system import \
-    StringType, IntType, FloatType, BoolType, TimestampType, JsonType
+    StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ImageType
 
 _logger = logging.getLogger('pixeltable')
 
@@ -64,8 +67,7 @@ class Dumper:
         with open(info_file, 'w') as info:
             toml.dump(info_dict, info)
 
-    #
-    # every major pixeltable DB feature)
+    # Expression types, predicate types, embedding indices, views on views
     def create_tables(self) -> None:
         schema = {
             'c1': StringType(nullable=False),
@@ -76,29 +78,11 @@ class Dumper:
             'c5': TimestampType(nullable=False),
             'c6': JsonType(nullable=False),
            'c7': JsonType(nullable=False),
+            'c8': ImageType(nullable=True)
         }
-        t = pxt.create_table('
+        t = pxt.create_table('base_table', schema, primary_key='c2')
 
-
-        t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
-        t.add_column(c9=[['a', 'b', 'c'], ['d', 'e', 'f']])
-        t.add_column(c10=[t.c1, [t.c1n, t.c2]])
-        t.add_column(c11={'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
-
-        # InPredicate
-        t.add_column(isin_1=t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
-        t.add_column(isin_2=t.c2.isin([1, 2, 3, 4, 5]))
-        t.add_column(isin_3=t.c2.isin(t.c6.f5))
-
-        # Add columns for .astype converters to ensure they're persisted properly
-        t.add_column(c2_as_float=t.c2.astype(FloatType()))
-
-        # Add columns for .apply
-        t.add_column(c2_to_string=t.c2.apply(str))
-        t.add_column(c6_to_string=t.c6.apply(json.dumps))
-        t.add_column(c6_back_to_json=t.c6_to_string.apply(json.loads))
-
-        num_rows = 100
+        num_rows = 20
         d1 = {
             'f1': 'test string 1',
             'f2': 1,
@@ -117,9 +101,8 @@
         c3_data = [float(i) for i in range(num_rows)]
         c4_data = [bool(i % 2) for i in range(num_rows)]
         c5_data = [datetime.datetime.now()] * num_rows
-        c6_data = [
-
-        d = {
+        c6_data = [
+            {
                 'f1': f'test string {i}',
                 'f2': i,
                 'f3': float(i),
@@ -130,8 +113,8 @@
                    'f8': [1.0, 2.0, 3.0, 4.0],
                },
            }
-
-
+            for i in range(num_rows)
+        ]
         c7_data = [d2] * num_rows
         rows = [
             {
@@ -143,40 +126,148 @@
                 'c5': c5_data[i],
                 'c6': c6_data[i],
                 'c7': c7_data[i],
+                'c8': None
             }
             for i in range(num_rows)
         ]
+
+        self.__add_expr_columns(t, 'base_table')
         t.insert(rows)
+
         pxt.create_dir('views')
-
-
+
+        # simple view
+        v = pxt.create_view('views.view', t, filter=(t.c2 < 50))
+        self.__add_expr_columns(v, 'view')
+
+        # snapshot
+        _ = pxt.create_view('views.snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+
+        # view of views
+        vv = pxt.create_view('views.view_of_views', v, filter=(t.c2 >= 25))
+        self.__add_expr_columns(vv, 'view_of_views')
+
+        # empty view
         e = pxt.create_view('views.empty_view', t, filter=t.c2 == 4171780)
         assert e.count() == 0
-
-
-        #
-
-
-
-
-
-
-
-
-
-        # Add remotes
-        from pixeltable.datatransfer.remote import MockRemote
-        v.link(
-            MockRemote('remote', {'int_field': pxt.IntType()}, {'str_field': pxt.StringType()}),
-            col_mapping={'test_udf': 'int_field', 'c1': 'str_field'}
+        self.__add_expr_columns(e, 'empty_view', include_expensive_functions=True)
+
+        # Add external stores
+        from pixeltable.io.external_store import MockProject
+        v._link_external_store(
+            MockProject.create(
+                v,
+                'project',
+                {'int_field': pxt.IntType()},
+                {'str_field': pxt.StringType()},
+                {'view_test_udf': 'int_field', 'c1': 'str_field'}
+            )
         )
-        # We're just trying to test metadata here, so
-        #
-        from pixeltable.
-
-
+        # We're just trying to test metadata here, so it's ok to link a false Label Studio project.
+        # We include a computed image column in order to ensure the creation of a stored proxy.
+        from pixeltable.io.label_studio import LabelStudioProject
+        col_mapping = Project.validate_columns(
+            v, {'str_field': pxt.StringType(), 'img_field': pxt.ImageType()}, {},
+            {'view_function_call': 'str_field', 'base_table_image_rot': 'img_field'}
         )
+        project = LabelStudioProject('ls_project_0', 4171780, media_import_method='file', col_mapping=col_mapping)
+        v._link_external_store(project)
+        # Sanity check that the stored proxy column did get created
+        assert len(project.stored_proxies) == 1
+        assert t.base_table_image_rot.col in project.stored_proxies
+
+    def __add_expr_columns(self, t: pxt.Table, col_prefix: str, include_expensive_functions=False) -> None:
+        def add_column(col_name: str, col_expr: Any) -> None:
+            t.add_column(**{f'{col_prefix}_{col_name}': col_expr})
+
+        # arithmetic_expr
+        add_column('plus', t.c2 + 6)
+        add_column('minus', t.c2 - 5)
+        add_column('times', t.c3 * 1.2)
+        add_column('div', t.c3 / 1.7)
+        add_column('mod', t.c2 % 11)
+
+        # array_slice
+        add_column('array_slice_1', t.c6[5])
+
+        # column_property_ref
+        add_column('fileurl', t.c8.fileurl)
+        add_column('localpath', t.c8.localpath)
+
+        # comparison
+        add_column('lt', t.c2 < t.c3)
+        add_column('le', t.c2 <= t.c3)
+        add_column('gt', t.c2 > t.c3)
+        add_column('ge', t.c2 >= t.c3)
+        add_column('ne', t.c2 != t.c3)
+        add_column('eq', t.c2 == t.c3)
+
+        # compound_predicate
+        add_column('and', (t.c2 >= 5) & (t.c2 < 8))
+        add_column('or', (t.c2 > 1) | t.c4)
+        add_column('not', ~(t.c2 > 20))
+
+        # function_call
+        add_column('function_call', pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1))  # library function
+        add_column('test_udf', test_udf_stored(t.c2))  # stored udf
+        add_column('test_udf_batched', test_udf_stored_batched(t.c1, upper=False))  # batched stored udf
+        if include_expensive_functions:
+            # batched library function
+            add_column('batched', pxt.functions.huggingface.clip_text(t.c1, model_id='openai/clip-vit-base-patch32'))
+
+        # image_member_access
+        add_column('image_mode', t.c8.mode)
+        add_column('image_rot', t.c8.rotate(180))
+
+        # in_predicate
+        add_column('isin_1', t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
+        add_column('isin_2', t.c2.isin([1, 2, 3, 4, 5]))
+        add_column('isin_3', t.c2.isin(t.c6.f5))
+
+        # inline_array and inline_dict
+        add_column('inline_array_1', [[1, 2, 3], [4, 5, 6]])
+        add_column('inline_array_2', [['a', 'b', 'c'], ['d', 'e', 'f']])
+        add_column('inline_list_exprs', [t.c1, [t.c1n, t.c2]])
+        add_column('inline_list_mixed', [1, 'a', t.c1, [1, 'a', t.c1n], 1, 'a'])
+        add_column('inline_dict', {'int': 22, 'dict': {'key': 'val'}, 'expr': t.c1})
+
+        # is_null
+        add_column('isnull', t.c1 == None)
+
+        # json_mapper and json_path
+        add_column('json_mapper', t.c6[3])
+        add_column('json_path', t.c6.f1)
+
+        # literal
+        add_column('str_const', 'str')
+        add_column('int_const', 5)
+        add_column('float_const', 5.0)
+        add_column('timestamp_const_1', datetime.datetime.utcnow())
+        add_column('timestamp_const_2', datetime.date.today())
+
+        # type_cast
+        add_column('astype', t.c2.astype(FloatType()))
+
+        # .apply
+        add_column('c2_to_string', t.c2.apply(str))
+        add_column('c6_to_string', t.c6.apply(json.dumps))
+        add_column('c6_back_to_json', t[f'{col_prefix}_c6_to_string'].apply(json.loads))
+
+        t.add_embedding_index(f'{col_prefix}_function_call', text_embed=embed_udf.clip_text_embed)
+
+        # query()
+        @t.query
+        def q1(i: int):
+            # this breaks; TODO: why?
+            #return t.where(t.c2 < i)
+            return t.where(t.c2 < i).select(t.c1, t.c2)
+        add_column('query_output', t.q1(t.c2))
+
+        @t.query
+        def q2(s: str):
+            sim = t[f'{col_prefix}_function_call'].similarity(s)
+            return t.order_by(sim, asc=False).select(t[f'{col_prefix}_function_call']).limit(5)
+        add_column('sim_output', t.q2(t.c1))
 
 
 @pxt.udf(_force_stored=True)
pixeltable/tool/embed_udf.py
ADDED
@@ -0,0 +1,9 @@
+import numpy as np
+
+import pixeltable as pxt
+
+
+# TODO This can go away once we have the ability to inline expr_udf's
+@pxt.expr_udf
+def clip_text_embed(txt: str) -> np.ndarray:
+    return pxt.functions.huggingface.clip_text(txt, model_id='openai/clip-vit-base-patch32')
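embed_udf.clip_text_embed exists so the test-DB dump can build an embedding index over a string column; create_test_db_dump.py passes it to add_embedding_index above. A usage sketch, assuming an existing table with a string column c1:

    import pixeltable as pxt
    from pixeltable.tool import embed_udf

    t = pxt.get_table('base_table')  # hypothetical existing table
    t.add_embedding_index('c1', text_embed=embed_udf.clip_text_embed)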
pixeltable/type_system.py
CHANGED
@@ -160,7 +160,7 @@ class ColumnType:
         if t == cls.Type.AUDIO:
             return AudioType()
         if t == cls.Type.DOCUMENT:
-            return
+            return DocumentType()
 
     def __str__(self) -> str:
         return self._type.name.lower()
@@ -250,7 +250,6 @@ class ColumnType:
             return None
         return None
 
-
     @classmethod
     def from_python_type(cls, t: type) -> Optional[ColumnType]:
         if typing.get_origin(t) is typing.Union: